def get_platform_specific_script(process: ProcessType,
                                 arguments: Dict[str, str]) -> str:
    """
    Returns the path to the script with arguments correctly formatted for the scheduler
    :param process: The process to get the script for
    :param arguments: Any arguments to be passed to the script
    :return: The string representing the path to the script with the appropriate arguments to run it
    """

    # Imported here to prevent a circular dependency
    from scripts.schedulers.scheduler_factory import Scheduler

    scheduler = Scheduler.get_scheduler()

    platform_dir = f"{platform.name.lower()}_scripts"
    script_extension = scheduler.SCRIPT_EXTENSION
    script_name = {
        ProcessType.rrup: "calc_rrups_single",
        ProcessType.clean_up: "clean_up",
        ProcessType.HF2BB: "hf2bb",
        ProcessType.LF2BB: "lf2bb",
        ProcessType.plot_srf: "plot_srf",
        ProcessType.plot_ts: "plot_ts",
    }[process]

    return scheduler.process_arguments(
        join(WORKFLOW_DIR, "scripts", platform_dir,
             f"{script_name}.{script_extension}"),
        arguments,
    )
Example #2
    def cancel_running(self, proc_types: List[const.ProcessType]):
        """Looks for any running task of the specified process types
        and attempts to cancel one of each.
        """
        # Get all running jobs in the mgmt db
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
        entries = db.command_builder(allowed_tasks=proc_types,
                                     allowed_states=[const.Status.running])

        # Cancel one for each process type
        for entry in entries:
            if entry.proc_type in proc_types:
                print(
                    f"Checkpoint testing: Cancelling job-id {entry.job_id} "
                    "for {entry.run_name} and process type {entry.proc_type}")

                out, err = Scheduler.get_scheduler().cancel_job(entry.job_id)

                print("Scancel out: ", out, err)
                if "error" not in out.lower() and "error" not in err.lower():
                    self.canceled_running.append(str(entry.job_id))
                    proc_types.remove(entry.proc_type)
                    print("Cancelled job-id {}".format(entry.job_id))

        return proc_types
def submit_script_to_scheduler(
        script: str,
        proc_type: int,
        queue_folder: str,
        sim_dir: str,
        run_name: str,
        target_machine: str = None,
        logger: Logger = get_basic_logger(),
):
    """
    Submits the slurm script and updates the management db.
    Calling the scheduler submitter may result in an error being raised.
    This is not caught in order to get immediate attention of broken runs.
    :param sim_dir:
    :param script: The location of the script to be run
    :param proc_type: The process type of the job being run
    :param queue_folder: Where the folder for database updates is
    :param run_name: The name of the realisation
    :param target_machine: The
    :param logger:
    :return:
    """
    job_id = Scheduler.get_scheduler().submit_job(sim_dir, script,
                                                  target_machine)

    add_to_queue(
        queue_folder,
        run_name,
        proc_type,
        const.Status.queued.value,
        job_id=job_id,
        logger=logger,
    )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("cybershake_folder",
                        help="Path to Cybershake root folder")
    parser.add_argument("-i", "--identifiers", nargs="+", help="realisation")
    parser.add_argument(
        "-e",
        "--extended_period",
        action="store_const",
        const="-e",
        default="",
        help="indicates extended pSA period to be calculated if present",
    )
    parser.add_argument(
        "-np",
        default=40,
        help="number of processes to use. Currently overridden to 1")
    parser.add_argument(
        "--account",
        default=platform_config[const.PLATFORM_CONFIG.DEFAULT_ACCOUNT],
        help="specify the NeSI project",
    )
    parser.add_argument("-o", "--output_dir", type=os.path.abspath())

    args = parser.parse_args()

    # The name parameter is only used to check user tasks in the queue monitor
    Scheduler.initialise_scheduler("", args.account)

    generate_empirical_script(
        args.np,
        args.extended_period,
        args.cybershake_folder,
        args.identifiers,
        args.output_dir,
    )
Example #5
    )
    parser.add_argument("--auto", nargs="?", type=str, const=True)
    parser.add_argument(
        "--account",
        type=str,
        default=platform_config[const.PLATFORM_CONFIG.DEFAULT_ACCOUNT.name],
    )
    parser.add_argument("--srf", type=str, default=None)
    parser.add_argument(
        "--machine",
        type=str,
        default=host,
        help="The machine emod3d is to be submitted to.",
    )
    parser.add_argument(
        "--write_directory",
        type=str,
        help="The directory to write the slurm script to.",
        default=None,
    )
    parser.add_argument("--rel_dir",
                        default=".",
                        type=str,
                        help="The path to the realisation directory")
    args = parser.parse_args()

    # The name parameter is only used to check user tasks in the queue monitor
    Scheduler.initialise_scheduler("", args.account)

    main(args)
Example #6
def update_tasks(
    mgmt_queue_entries: List[str],
    squeue_tasks: Dict[str, str],
    db_running_tasks: List[SchedulerTask],
    complete_data: bool,
    task_logger: Logger,
    root_folder: str,
):
    """Updates the mgmt db entries based on the HPC queue"""
    tasks_to_do = []

    task_logger.debug("Checking running tasks in the db for updates")
    task_logger.debug(
        f"The key value pairs found in {Scheduler.get_scheduler().QUEUE_NAME} are as follows: {squeue_tasks.items()}"
    )
    for db_running_task in db_running_tasks:
        task_logger.debug("Checking task {}".format(db_running_task))
        if str(db_running_task.job_id) in squeue_tasks.keys():

            queue_status = squeue_tasks[str(db_running_task.job_id)]
            task_logger.debug("Found task. It has state {}".format(queue_status))

            try:
                queue_status = Scheduler.get_scheduler().STATUS_DICT[queue_status]
            except KeyError:
                task_logger.error(
                    "Failed to recognize state code {}, updating to {}".format(
                        queue_status, const.Status.unknown.value
                    )
                )
                queue_status = const.Status.unknown.value
            task_logger.debug("This state represents status {}".format(queue_status))

            if queue_status == db_running_task.status:
                task_logger.debug(
                    "No need to update status {} for {}, {} ({}) as it "
                    "has not changed.".format(
                        const.Status(queue_status).str_value,
                        db_running_task.run_name,
                        const.ProcessType(db_running_task.proc_type).str_value,
                        db_running_task.job_id,
                    )
                )
            elif not check_mgmt_queue(
                mgmt_queue_entries,
                db_running_task.run_name,
                db_running_task.proc_type,
                logger=task_logger,
            ):
                task_logger.info(
                    "Updating status of {}, {} from {} to {}".format(
                        db_running_task.run_name,
                        const.ProcessType(db_running_task.proc_type).str_value,
                        const.Status(db_running_task.status).str_value,
                        const.Status(queue_status).str_value,
                    )
                )
                tasks_to_do.append(
                    SchedulerTask(
                        db_running_task.run_name,
                        db_running_task.proc_type,
                        queue_status,
                        None,
                        None,
                    )
                )
            else:
                # Do nothing if there is a pending update for
                # this run & process type combination
                pass
        # Only reset if there is no entry on the mgmt queue for this
        # realisation/proc combination and nothing in the mgmt folder
        elif not check_mgmt_queue(
            mgmt_queue_entries,
            db_running_task.run_name,
            db_running_task.proc_type,
            logger=task_logger,
        ):
            if not complete_data:
                task_logger.warning(
                    f"Task '{const.ProcessType(db_running_task.proc_type).str_value}' not found on "
                    f"{Scheduler.get_scheduler().QUEUE_NAME} or in the management db folder, "
                    f"but errors were encountered when querying {Scheduler.get_scheduler().QUEUE_NAME}. Not resubmitting."
                )
            else:
                task_logger.warning(
                    f"Task '{const.ProcessType(db_running_task.proc_type).str_value}' on '{db_running_task.run_name}' "
                    f"not found on {Scheduler.get_scheduler().QUEUE_NAME} or in the management db folder; resetting the status "
                    "to 'created' for resubmission"
                )
                # Add an error
                tasks_to_do.append(
                    SchedulerTask(
                        db_running_task.run_name,
                        db_running_task.proc_type,
                        const.Status.failed.value,
                        None,
                        f"Disappeared from {Scheduler.get_scheduler().QUEUE_NAME}. Creating a new task.",
                    )
                )
            # When a job has failed, we also want to log its metadata
            (
                start_time,
                end_time,
                run_time,
                n_cores,
                status,
            ) = Scheduler.get_scheduler().get_metadata(db_running_task, task_logger)
            log_file = os.path.join(
                sim_struct.get_sim_dir(root_folder, db_running_task.run_name),
                "ch_log",
                "metadata_log.json",
            )
            # now log metadata
            store_metadata(
                log_file,
                const.ProcessType(db_running_task.proc_type).str_value,
                {
                    "start_time": start_time,
                    "end_time": end_time,
                    "run_time": run_time,
                    "cores": n_cores,
                    "status": status,
                },
                logger=task_logger,
            )
    return tasks_to_do
Example #7
def queue_monitor_loop(
    root_folder: str,
    sleep_time: int,
    max_retries: int,
    queue_logger: Logger = qclogging.get_basic_logger(),
    alert_url=None,
):
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    queue_folder = sim_struct.get_mgmt_db_queue(root_folder)

    queue_logger.info("Running queue-monitor, exit with Ctrl-C.")

    mgmt_db.add_retries(max_retries)

    sqlite_tmpdir = "/tmp/cer"
    while keepAlive:
        complete_data = True
        if not os.path.exists(sqlite_tmpdir):
            os.makedirs(sqlite_tmpdir)
            queue_logger.debug("Set up the sqlite_tmpdir")

        # For each HPC, get a list of (job id, status) pairs and save them in a dictionary
        queued_tasks = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=False, target_machine=hpc
                )
            except EnvironmentError as e:
                queue_logger.critical(e)
                queue_logger.critical(
                    f"An error was encountered when attempting to check {Scheduler.get_scheduler().QUEUE_NAME} for HPC {hpc}. "
                    "Tasks will not be submitted to this HPC until the issue is resolved"
                )
                complete_data = False
            else:
                for task in squeued_tasks:
                    queued_tasks[task.split()[0]] = task.split()[1]

        if len(queued_tasks) > 0:
            if len(queued_tasks) > 200:
                queue_logger.log(
                    VERYVERBOSE,
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}",
                )
                queue_logger.info(
                    f"Over 200 tasks were found in the queue. Check the log for an exact listing of them"
                )
            else:
                queue_logger.info(
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}"
                )
        else:
            queue_logger.debug(f"No {Scheduler.get_scheduler().QUEUE_NAME} tasks")

        db_in_progress_tasks = mgmt_db.get_submitted_tasks()
        if len(db_in_progress_tasks) > 0:

            queue_logger.info(
                "In progress tasks in mgmt db:"
                + ", ".join(
                    [
                        "{}-{}-{}-{}".format(
                            entry.run_name,
                            const.ProcessType(entry.proc_type).str_value,
                            entry.job_id,
                            const.Status(entry.status).str_value,
                        )
                        for entry in db_in_progress_tasks
                    ]
                )
            )

        entry_files = os.listdir(queue_folder)
        entry_files.sort()

        entries = []

        for file_name in entry_files[::-1]:
            queue_logger.debug(
                "Checking {} to see if it is a valid update file".format(file_name)
            )
            entry = get_queue_entry(os.path.join(queue_folder, file_name), queue_logger)
            if entry is None:
                queue_logger.debug(
                    "Removing {} from the list of update files".format(file_name)
                )
                entry_files.remove(file_name)
            else:
                if str(entry.job_id) in queued_tasks.keys() and entry.status > 3:
                    # This prevents a race condition where the failure/completion state file
                    # is created and picked up before the job has actually finished.
                    # Most notably this happens on Kisti.
                    # The queued and running states are allowed through.
                    queue_logger.debug(
                        "Job {} is still running on the HPC, skipping this iteration".format(
                            entry
                        )
                    )
                    entry_files.remove(file_name)
                else:
                    queue_logger.debug("Adding {} to the list of updates".format(entry))
                    entries.insert(0, entry)

        entries.extend(
            update_tasks(
                entry_files,
                queued_tasks,
                db_in_progress_tasks,
                complete_data,
                queue_logger,
                root_folder,
            )
        )

        if len(entries) > 0:
            queue_logger.info("Updating {} mgmt db tasks.".format(len(entries)))
            if mgmt_db.update_entries_live(entries, max_retries, queue_logger):
                for file_name in entry_files:
                    os.remove(os.path.join(queue_folder, file_name))
                # Check for jobs that match the alert criteria
                if alert_url is not None:
                    for entry in entries:
                        if entry.status == const.Status.failed.value:
                            entry_retries = mgmt_db.get_retries(
                                entry.proc_type, entry.run_name
                            )
                            if entry_retries < max_retries:
                                msg = f"fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error}"
                            elif entry_retries >= max_retries:
                                msg = f"@here fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error} and met the retry cap"
                            send_alert(msg, alert_url)
            else:
                queue_logger.error(
                    "Failed to update the current entries in the mgmt db queue. "
                    "Please investigate and fix. If this is a repeating error, then this "
                    "will block all other entries from updating."
                )
        else:
            queue_logger.info("No entries in the mgmt db queue.")

        # Nap time
        queue_logger.debug("Sleeping for {}".format(sleep_time))
        time.sleep(sleep_time)
Example #8
def init_scheduler():
    print("Initialising scheduler")
    Scheduler.initialise_scheduler("test_user")
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "root_folder",
        help="The root directory of the simulation folder structure")
    parser.add_argument("user", help="Your username")
    parser.add_argument(
        "config_file",
        help="The location of the config file containing everything to be run",
        nargs="?",
        default=join(
            platform_config[const.PLATFORM_CONFIG.TEMPLATES_DIR.name],
            "task_config.yaml",
        ),
    )
    parser.add_argument(
        "--sleep_time",
        type=int,
        help="Seconds sleeping between checking queue and adding more jobs",
        default=5,
    )
    parser.add_argument(
        "--n_max_retries",
        help="The maximum number of retries for any given task",
        default=2,
        type=int,
    )
    parser.add_argument(
        "--n_runs",
        default=None,
        type=int,
        nargs="+",
        help=
        "The number of processes each machine can run at once. If a single value is given this is used for all "
        "machines, otherwise one value per machine must be given. The current order is: {}"
        .format(list(x.value for x in HPC)),
    )
    parser.add_argument(
        "--log_folder",
        type=str,
        default=".",
        help=
        "Location of the directory to place logs in. Defaults to the value of the root_folder argument. "
        "Must be absolute or relative to the root_folder.",
    )
    parser.add_argument("--debug",
                        action="store_true",
                        help="Print debug messages to stdout")
    parser.add_argument("--alert_url",
                        help="the url to slack alert channel",
                        default=None)
    args = parser.parse_args()

    wrapper_logger = qclogging.get_logger(name="cybershake_wrapper",
                                          threaded=True)
    master_logger = qclogging.get_logger(name=None,
                                         threaded=True,
                                         stdout_printer=False)

    if args.debug:
        qclogging.set_stdout_level(wrapper_logger, DEBUG)

    root_directory = abspath(args.root_folder)
    log_directory = join(root_directory, args.log_folder)
    wrapper_log_file = join(
        log_directory,
        WRAPPER_LOG_FILE_NAME.format(datetime.now().strftime(
            const.TIMESTAMP_FORMAT)),
    )

    master_log_file = join(
        log_directory,
        MASTER_LOG_NAME.format(datetime.now().strftime(
            const.TIMESTAMP_FORMAT)),
    )
    scheduler_log_file = join(
        log_directory,
        SCHEDULER_LOG_NAME.format(datetime.now().strftime(
            const.TIMESTAMP_FORMAT)),
    )

    qclogging.add_general_file_handler(master_logger, master_log_file)
    qclogging.add_general_file_handler(wrapper_logger, wrapper_log_file)
    wrapper_logger.info("Logger file added")

    scheduler_logger = qclogging.get_logger(name="scheduler", threaded=True)
    qclogging.add_general_file_handler(scheduler_logger, scheduler_log_file)
    Scheduler.initialise_scheduler(user=args.user, logger=scheduler_logger)

    n_runs = 0
    if args.n_runs is not None:
        if len(args.n_runs) == 1:
            n_runs = {hpc: args.n_runs[0] for hpc in HPC}
            wrapper_logger.debug(
                "Using {} as the maximum number of jobs per machine".format(
                    args.n_runs[0]))
        elif len(args.n_runs) == len(HPC):
            n_runs = {}
            for index, hpc in enumerate(HPC):
                wrapper_logger.debug(
                    "Setting {} to have at most {} concurrently running jobs".
                    format(hpc, args.n_runs[index]))
                n_runs.update({hpc: args.n_runs[index]})
        else:
            incorrect_n_runs = (
                "You must specify wither one common value for --n_runs, or one "
                "for each in the following list: {}".format(
                    [hpc.name for hpc in HPC]))
            wrapper_logger.log(qclogging.NOPRINTCRITICAL, incorrect_n_runs)
            parser.error(incorrect_n_runs)
    else:
        n_runs = {
            HPC[hpc]:
            platform_config[const.PLATFORM_CONFIG.DEFAULT_N_RUNS.name][hpc]
            for hpc in platform_config[
                const.PLATFORM_CONFIG.AVAILABLE_MACHINES.name]
        }
    wrapper_logger.debug(
        "Machines will allow up to {} jobs to run simultaneously".format(
            n_runs))

    tasks_n, tasks_to_match = parse_config_file(args.config_file,
                                                wrapper_logger)

    run_automated_workflow(
        root_directory,
        log_directory,
        n_runs,
        args.n_max_retries,
        tasks_n,
        args.sleep_time,
        tasks_to_match,
        wrapper_logger,
        args.debug,
        alert_url=args.alert_url,
    )
Example #10
def main():
    logger = qclogging.get_logger()

    parser = argparse.ArgumentParser()

    parser.add_argument("root_folder",
                        type=str,
                        help="The cybershake root folder")
    parser.add_argument(
        "--n_runs",
        default=None,
        type=int,
        nargs="+",
        help=
        "The number of processes each machine can run at once. If a single value is given this is used for all "
        "machines, otherwise one value per machine must be given. The current order is: {}"
        .format(list(x.name for x in HPC)),
    )
    parser.add_argument(
        "user",
        type=str,
        help="The username under which the jobs will be submitted.")
    parser.add_argument(
        "--sleep_time",
        type=int,
        help="Seconds sleeping between checking queue and adding more jobs",
        default=5,
    )
    parser.add_argument(
        "--log_file",
        type=str,
        default=None,
        help=
        "Location of the log file to use. Defaults to 'cybershake_log.txt' in the location root_folder. "
        "Must be absolute or relative to the root_folder.",
    )
    parser.add_argument(
        "--task_types_to_run",
        nargs="+",
        help=
        "Which processes should be run. Defaults to IM_Calc and clean_up with dependencies automatically propagated",
        choices=[proc.str_value for proc in const.ProcessType],
        default=[const.ProcessType.clean_up.str_value],
    )
    parser.add_argument(
        "--rels_to_run",
        help=
        "An SQLite formatted query to match the realisations that should run.",
        default="%",
    )

    args = parser.parse_args()

    root_folder = os.path.abspath(args.root_folder)

    if args.log_file is None:
        qclogging.add_general_file_handler(
            logger,
            os.path.join(
                root_folder,
                AUTO_SUBMIT_LOG_FILE_NAME.format(datetime.now().strftime(
                    const.TIMESTAMP_FORMAT)),
            ),
        )
    else:
        qclogging.add_general_file_handler(
            logger, os.path.join(root_folder, args.log_file))
    logger.debug("Added file handler to the logger")

    logger.debug("Raw args passed in as follows: {}".format(str(args)))

    n_runs = 0
    if args.n_runs is not None:
        if len(args.n_runs) == 1:
            n_runs = {hpc: args.n_runs[0] for hpc in HPC}
            logger.debug(
                "Using {} as the maximum number of jobs per machine".format(
                    args.n_runs[0]))
        elif len(args.n_runs) == len(HPC):
            n_runs = {}
            for index, hpc in enumerate(HPC):
                logger.debug(
                    "Setting {} to have at most {} concurrently running jobs".
                    format(hpc, args.n_runs[index]))
                n_runs.update({hpc: args.n_runs[index]})
        else:
            logger.critical(
                "Expected either 1 or {} values for --n_runs, got {} values. Specifically: {}. Exiting now"
                .format(len(HPC), len(args.n_runs), args.n_runs))
            parser.error(
                "You must specify wither one common value for --n_runs, or one "
                "for each in the following list: {}".format(list(HPC)))
    else:
        n_runs = platform_config[const.PLATFORM_CONFIG.DEFAULT_N_RUNS.name]

    logger.debug(
        "Processes to be run were: {}. Getting all required dependencies now.".
        format(args.task_types_to_run))
    task_types_to_run = [
        const.ProcessType.from_str(proc) for proc in args.task_types_to_run
    ]
    for task in task_types_to_run:
        logger.debug(
            "Process {} in processes to be run, adding dependencies now.".
            format(task.str_value))
        for proc_num in task.get_remaining_dependencies(task_types_to_run):
            proc = const.ProcessType(proc_num)
            if proc not in task_types_to_run:
                logger.debug(
                    "Process {} added as a dependency of process {}".format(
                        proc.str_value, task.str_value))
                task_types_to_run.append(proc)

    mutually_exclusive_task_error = const.ProcessType.check_mutually_exclusive_tasks(
        task_types_to_run)
    if mutually_exclusive_task_error != "":
        logger.log(qclogging.NOPRINTCRITICAL, mutually_exclusive_task_error)
        parser.error(mutually_exclusive_task_error)

    logger.debug("Processed args are as follows: {}".format(str(args)))

    scheduler_logger = qclogging.get_logger(name=f"{logger.name}.scheduler")
    Scheduler.initialise_scheduler(user=args.user, logger=scheduler_logger)

    logger.info("Loading estimation models")
    lf_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "LF"),
        logger=logger,
    )
    hf_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "HF"),
        logger=logger,
    )
    bb_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "BB"),
        logger=logger,
    )
    im_est_model = est.load_full_model(
        os.path.join(
            platform_config[const.PLATFORM_CONFIG.ESTIMATION_MODELS_DIR.name],
            "IM"),
        logger=logger,
    )

    run_main_submit_loop(
        root_folder,
        n_runs,
        args.rels_to_run,
        task_types_to_run,
        args.sleep_time,
        (lf_est_model, hf_est_model, bb_est_model, im_est_model),
        main_logger=logger,
    )
Example #11
def run_main_submit_loop(
        root_folder: str,
        n_runs: Dict[str, int],
        rels_to_run: str,
        given_tasks_to_run: List[const.ProcessType],
        sleep_time: int,
        models_tuple: Tuple[est.EstModel],
        main_logger: Logger = qclogging.get_basic_logger(),
        cycle_timeout=1,
):
    mgmt_queue_folder = sim_struct.get_mgmt_db_queue(root_folder)
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    root_params_file = os.path.join(sim_struct.get_runs_dir(root_folder),
                                    "root_params.yaml")
    config = utils.load_yaml(root_params_file)
    main_logger.info("Loaded root params file: {}".format(root_params_file))
    # Default values

    hf_seed = config["hf"].get(const.RootParams.seed.value,
                               const.HF_DEFAULT_SEED)
    main_logger.debug("hf_seed set to {}".format(hf_seed))

    main_logger.debug(
        f"extended_period set to {config['ims']['extended_period']}")

    time_since_something_happened = cycle_timeout

    while time_since_something_happened > 0:
        main_logger.debug("time_since_something_happened is now {}".format(
            time_since_something_happened))
        time_since_something_happened -= 1
        # Get items in the mgmt queue, have to get a snapshot instead of
        # checking the directory real-time to prevent timing issues,
        # which can result in dual-submission
        mgmt_queue_entries = os.listdir(mgmt_queue_folder)

        # Get in progress tasks in the db and the HPC queue
        n_tasks_to_run = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=True, target_machine=hpc)
            except EnvironmentError as e:
                main_logger.critical(e)
                n_tasks_to_run[hpc] = 0
            else:
                n_tasks_to_run[hpc] = n_runs[hpc] - len(squeued_tasks)
                if len(squeued_tasks) > 0:
                    main_logger.debug(
                        "There was at least one job in squeue, resetting timeout"
                    )
                    time_since_something_happened = cycle_timeout

        # Gets all runnable tasks based on mgmt db state
        runnable_tasks = mgmt_db.get_runnable_tasks(
            rels_to_run,
            sum(n_runs.values()),
            os.listdir(sim_struct.get_mgmt_db_queue(root_folder)),
            given_tasks_to_run,
            main_logger,
        )
        if len(runnable_tasks) > 0:
            time_since_something_happened = cycle_timeout
            main_logger.info("Number of runnable tasks: {}".format(
                len(runnable_tasks)))
            main_logger.debug(
                "There was at least one runnable task, resetting timeout")
        else:
            main_logger.debug("No runnable_tasks")

        # Select the first ntask_to_run that are not waiting
        # for mgmt db updates (i.e. items in the queue)
        tasks_to_run, task_counter = [], {key: 0 for key in HPC}
        for cur_proc_type, cur_run_name, retries in runnable_tasks:

            cur_hpc = get_target_machine(cur_proc_type)
            # Add task if limit has not been reached and there are no
            # outstanding mgmt db updates
            if (not shared_automated_workflow.check_mgmt_queue(
                    mgmt_queue_entries, cur_run_name, cur_proc_type) and
                    task_counter.get(cur_hpc, 0) < n_tasks_to_run[cur_hpc]):
                tasks_to_run.append((cur_proc_type, cur_run_name, retries))
                task_counter[cur_hpc] += 1

            # Open to better suggestions
            # Break if enough tasks for each HPC have been added
            if np.all([
                    task_counter.get(hpc, 0) >= n_tasks_to_run[hpc]
                    for hpc in n_tasks_to_run
            ]):
                break

        if len(tasks_to_run) > 0:
            main_logger.info("Tasks to run this iteration: " + ", ".join([
                "{}-{}".format(entry[1],
                               const.ProcessType(entry[0]).str_value)
                for entry in tasks_to_run
            ]))
        else:
            main_logger.debug("No tasks to run this iteration")

        # Submit the runnable tasks
        for proc_type, run_name, retries in tasks_to_run:

            # Special handling for merge-ts
            if proc_type == const.ProcessType.merge_ts.value:
                # Check if clean up has already run
                if mgmt_db.is_task_complete([
                        const.ProcessType.clean_up.value,
                        run_name,
                        const.Status.completed.str_value,
                ]):
                    # If clean_up has already run, then we should set it to
                    # be run again after merge_ts has run
                    shared_automated_workflow.add_to_queue(
                        mgmt_queue_folder,
                        run_name,
                        const.ProcessType.clean_up.value,
                        const.Status.created.value,
                        logger=main_logger,
                    )

            # submit the job
            submit_task(
                sim_struct.get_sim_dir(root_folder, run_name),
                proc_type,
                run_name,
                root_folder,
                main_logger,
                retries=retries,
                hf_seed=hf_seed,
                models=models_tuple,
            )
        main_logger.debug("Sleeping for {} second(s)".format(sleep_time))
        time.sleep(sleep_time)
    main_logger.info(
        "Nothing was running or ready to run last cycle, exiting now")