Example no. 1
0
def test_cluster__create(cluster):
    """Exercise the Cluster lifecycle: creation, demotion, and re-promotion.

    A freshly created cluster starts at version 1 with its creator as
    submitter; demoting bumps the config version; deserializing without
    promotion yields a read-only view with no job status; deserializing
    with try_promote_to_submitter reclaims the role and bumps the version.
    """
    # A new cluster: two jobs configured, none submitted or completed.
    assert cluster.config.num_jobs == 2
    assert cluster.config.submitted_jobs == 0
    assert cluster.config.completed_jobs == 0
    assert not cluster.all_jobs_submitted()
    # The creating process holds the submitter role initially.
    assert cluster.has_submitter()
    assert cluster.am_i_submitter()
    assert cluster.config.version == 1
    assert cluster.job_status.version == 1
    # Giving up the submitter role increments the config version.
    cluster.demote_from_submitter()
    assert not cluster.am_i_submitter()
    assert cluster.config.version == 2

    # Reload from disk without requesting promotion: no submitter is set
    # and no job status is loaded.
    cluster, _ = Cluster.deserialize(cluster.config.path)
    assert not cluster.has_submitter()
    assert not cluster.am_i_submitter()
    assert cluster.job_status is None
    assert cluster.config.num_jobs == 2
    assert cluster.config.submitted_jobs == 0
    assert cluster.config.completed_jobs == 0
    assert cluster.config.version == 2

    # Reload with promotion requested: this process becomes the submitter,
    # job status is deserialized, and the version bumps again.
    cluster, promoted = Cluster.deserialize(
        cluster.config.path,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    assert promoted
    assert cluster.am_i_submitter()
    assert cluster.job_status is not None
    assert cluster.config.version == 3
Example no. 2
0
File: cluster.py Project: NREL/jade
def hostnames(output_dir, job_id, verbose):
    """Show the hostnames of active nodes participating in the batch."""
    if verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    # One submission group's HPC settings suffice for node queries.
    group_lookup = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(group_lookup, output_dir)
    # Gather nodes for every active HPC job, or only the requested one.
    nodes = [
        node
        for hpc_job_id in cluster.job_status.hpc_job_ids
        if job_id is None or hpc_job_id == job_id
        for node in hpc_mgr.list_active_nodes(hpc_job_id)
    ]

    if not nodes:
        print("No nodes were detected.", file=sys.stderr)
        sys.exit(1)

    print(" ".join(nodes))
Example no. 3
0
File: cluster.py Project: NREL/jade
def manager_node(output_dir, job_id, verbose):
    """Print the name of the manager node to the console. Requires a single job in the batch."""
    console_level = logging.INFO
    if verbose:
        console_level = logging.DEBUG
    setup_logging(__name__, None, console_level=console_level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    # Only the first submission group's HPC settings are needed here.
    group_lookup = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(group_lookup, output_dir)
    active_ids = cluster.job_status.hpc_job_ids
    if job_id not in active_ids:
        print(f"job_id={job_id} is not active", file=sys.stderr)
        sys.exit(1)

    # The first active node for the HPC job is the manager node.
    print(hpc_mgr.list_active_nodes(job_id)[0])
Example no. 4
0
    def run_submit_jobs(config_file, output, params, pipeline_stage_num=None):
        """Allows submission from an existing Python process."""
        os.makedirs(output, exist_ok=True)

        submitter = JobSubmitter.create(config_file, params, output=output)
        cluster = Cluster.create(
            output,
            submitter.config,
            pipeline_stage_num=pipeline_stage_num,
        )

        is_local = params.hpc_config.hpc_type == HpcType.LOCAL
        exit_code = 1
        try:
            status = submitter.submit_jobs(cluster, force_local=is_local)
            if status != Status.IN_PROGRESS:
                exit_code = status.value
            else:
                check_cmd = f"jade show-status -o {output}"
                if not params.dry_run:
                    print(
                        f"Jobs are in progress. Run '{check_cmd}' for updates."
                    )
                exit_code = 0
        finally:
            # Always release the submitter role, even on failure.
            cluster.demote_from_submitter()
            if is_local:
                # These files were not used in this case.
                cluster.delete_files_internal()

        return exit_code
Example no. 5
0
def cancel_jobs(output, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    if verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")
    logger.info(get_cli_string())

    # Retry for up to ~60 seconds to acquire the submitter role.
    attempts_left = 60
    while attempts_left > 0:
        attempts_left -= 1
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        sys.exit(0)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
Example no. 6
0
 def _handle_submission_groups(self):
     """Ensure the job configuration has a submission group.

     Falls back to the single group stored in the cluster config when the
     user did not define any.
     """
     # The JobConfiguration will not have any groups if the user didn't define any.
     # Reload from the cluster config.
     if not self._config.submission_groups:
         submission_groups = Cluster.deserialize_submission_groups(
             Path(self._output))
         # Exactly one group is expected in this fallback path.
         assert len(submission_groups) == 1
         self._config.append_submission_group(submission_groups[0])
Example no. 7
0
def test_cluster__version_mismatch(cluster):
    """Verify stale on-disk version files trigger ConfigVersionMismatch.

    Manually bumps the config and job-status version files so the
    in-memory cluster becomes out of date, then checks that both
    promotion and job-status updates fail.
    """
    cluster.demote_from_submitter()
    assert not cluster.am_i_submitter()
    # Simulate another process having advanced the versions on disk.
    with open(cluster._config_version_file, "w") as f_out:
        f_out.write(str(cluster.config.version + 1) + "\n")
    with open(cluster._job_status_version_file, "w") as f_out:
        f_out.write(str(cluster.job_status.version + 1) + "\n")

    try:
        with pytest.raises(ConfigVersionMismatch):
            cluster.promote_to_submitter()
    finally:
        # NOTE(review): the failed call appears to leave the lock file
        # behind; remove it so the next operation can proceed — confirm.
        os.remove(Cluster.get_lock_file(cluster.config.path))

    try:
        with pytest.raises(ConfigVersionMismatch):
            submitted_jobs = cluster.job_status.jobs
            cluster.update_job_status(submitted_jobs, [], set(), [], [1], 1)
    finally:
        os.remove(Cluster.get_lock_file(cluster.config.path))
Example no. 8
0
def show_times(output_dirs, verbose):
    """Show the run times of all allocated jobs."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)

    job_ids = []
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=False)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr)
            sys.exit(1)

        # HPC job IDs are parsed from stderr files matching "*.e";
        # assumes names like "x_y_<id>.e" — TODO confirm naming scheme.
        job_ids += [
            x.name.split("_")[2].replace(".e", "") for x in path.glob("*.e")
        ]

    job_ids.sort(key=lambda x: int(x))
    # NOTE(review): cluster and output here are the values from the LAST
    # loop iteration; this assumes every directory shares compatible
    # submission-group settings — confirm.
    groups = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output)

    total_duration = timedelta(seconds=0)
    table = PrettyTable()
    table.field_names = HpcJobStats._fields

    total_aus = 0
    # AU (allocation unit) accounting is only available on NREL's Eagle.
    if os.environ.get("NREL_CLUSTER") == "eagle":
        au_parser = get_nrel_eagle_aus
    else:
        au_parser = None

    for job_id in job_ids:
        stats = hpc_mgr.get_job_stats(job_id)
        if stats is None:
            continue
        # NOTE(review): computed before the isinstance check below — if
        # stats.end is not a datetime this subtraction may raise; confirm
        # what get_job_stats returns for incomplete jobs.
        duration = stats.end - stats.start
        if stats.state == HpcJobStatus.COMPLETE and isinstance(
                stats.end, datetime):
            total_duration += duration
        data = stats._asdict()
        data["state"] = data["state"].value
        if au_parser is not None:
            total_aus += au_parser(duration, stats.qos)
        table.add_row(data.values())

    print(table)
    print(f"\nTotal duration = {total_duration}")
    print("Total hours = {:.2f}".format(total_duration.total_seconds() / 3600))
    if au_parser is not None:
        print("Total AUs = {:.2f}".format(total_aus))
Example no. 9
0
def wait(output, poll_interval):
    """Wait for a JADE submission to complete."""
    complete = False
    while not complete:
        try:
            cluster, _ = Cluster.deserialize(output)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr)
            sys.exit(1)
        complete = cluster.is_complete()
        if complete:
            print("All jobs are complete")
        else:
            # poll_interval is in minutes.
            time.sleep(poll_interval * 60)
Example no. 10
0
def resubmit_jobs(output, failed, missing, verbose):
    """Resubmit failed and missing jobs."""
    # Append to the existing event/submission logs rather than truncating.
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    # Resubmission is only valid once the prior submission has finished.
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print(
            "resubmit-jobs requires that the existing submission be complete",
            file=sys.stderr)
        sys.exit(1)
    # NOTE(review): checked only after the completeness guard, so the
    # not-complete branch may demote without having been promoted — confirm.
    assert promoted

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(
        jobs_to_resubmit, output)
    # Clear previous results so resubmitted jobs start fresh.
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit,
                                     updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        # Always release the submitter role.
        cluster.demote_from_submitter()

    sys.exit(ret)
Example no. 11
0
def cluster():
    """Pytest fixture: create a two-job generic-command Cluster in OUTPUT.

    Writes a commands file, builds and dumps a GenericCommandConfiguration,
    yields a freshly created Cluster, and removes OUTPUT on teardown.
    """
    os.makedirs(OUTPUT, exist_ok=True)
    commands = ["echo 'hello'"] * 2
    cmd_file = os.path.join(OUTPUT, "commands.txt")
    with open(cmd_file, "w") as f_out:
        for cmd in commands:
            f_out.write(cmd + "\n")

    jade_config = GenericCommandConfiguration.auto_config(cmd_file)
    config_file = os.path.join(OUTPUT, CONFIG_FILE)
    jade_config.dump(config_file)
    # NOTE(review): the original also built an HpcConfig here that was
    # never used; removed as dead code.
    cluster = Cluster.create(OUTPUT, jade_config)

    yield cluster

    # Teardown: remove everything the fixture created.
    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)
Example no. 12
0
File: cluster.py Project: NREL/jade
def am_i_manager(output_dir, verbose):
    """Print 'true' or 'false' depending on whether the current node is the manager node."""
    if verbose:
        console_level = logging.DEBUG
    else:
        console_level = logging.INFO
    setup_logging(__name__, None, console_level=console_level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    # Only the first submission group's HPC settings are needed here.
    group_lookup = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(group_lookup, output_dir)
    # Lowercase, no newline: shell scripts can capture the value directly.
    print(str(hpc_mgr.am_i_manager()).lower(), end="")
Example no. 13
0
def list_active_ids(output_dirs, verbose):
    """List the HPC job IDs that are pending or running."""
    # TODO: add flag for only pending or only running
    if verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    setup_logging(__name__, None, console_level=level)

    job_ids = []
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=True)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr)
            sys.exit(1)

        # Completed clusters have no active HPC jobs to report.
        if not cluster.is_complete():
            job_ids.extend(cluster.iter_hpc_job_ids())

    # Sort numerically, then print space-separated for shell consumption.
    job_ids.sort(key=int)
    print(" ".join(job_ids))
Example no. 14
0
    def _complete_hpc_job(self):
        """Complete the HPC job in the cluster. A future submitter could detect this. However,
        it's better to do it here for these reasons:

        1. If we don't do this and the user runs 'jade show-status -j' before the next submitter
           runs, a stale HPC job ID will show up.
        2. The last submitter will never detect its job as complete, and so the job status file
           will always include it.

        """
        job_id = self._intf.get_current_job_id()
        if job_id is None:
            # TODO: need to implement persistent recording of fake status for FakeManager
            return

        # Retry promotion for up to max_time_s seconds, polling every
        # `interval` seconds; another process may hold the submitter role.
        max_time_s = 30
        interval = 5
        completed = False
        for _ in range(max_time_s // interval):
            cluster, promoted = Cluster.deserialize(
                self._output,
                try_promote_to_submitter=True,
                deserialize_jobs=True)
            if promoted:
                try:
                    cluster.complete_hpc_job_id(job_id)
                finally:
                    # Release the role even if completing the ID fails.
                    cluster.demote_from_submitter()
                completed = True
                break
            time.sleep(interval)

        if not completed:
            # Best-effort: leave the stale ID in place and warn.
            logger.warning(
                "Could not promote to submitter. HPC job ID %s is still present",
                job_id)
Example no. 15
0
def resubmit_jobs(output, failed, missing, successful, submission_groups_file, verbose):
    """Resubmit jobs."""
    # Append to the existing event/submission logs rather than truncating.
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    # Resubmission is only valid once the prior submission has finished.
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print("resubmit-jobs requires that the existing submission be complete", file=sys.stderr)
        sys.exit(1)
    # NOTE(review): checked only after the completeness guard, so the
    # not-complete branch may demote without having been promoted — confirm.
    assert promoted

    # Optionally override submitter parameters with user-supplied groups.
    # The replacement set must match the original groups one-to-one by name.
    if submission_groups_file is not None:
        groups = load_data(submission_groups_file)
        cur = len(groups)
        orig = len(cluster.config.submission_groups)
        if cur != orig:
            print(
                f"Length of submission_groups ({cur}) must be identical to the original ({orig})",
                file=sys.stderr,
            )
            cluster.demote_from_submitter()
            sys.exit(1)

        for _group in groups:
            group = SubmissionGroup(**_group)
            found = False
            # Replace the original group bearing the same name in place.
            for i, orig_group in enumerate(cluster.config.submission_groups):
                if group.name == orig_group.name:
                    cluster.config.submission_groups[i] = group
                    found = True
                    break
            if not found:
                print(
                    f"submission group {group.name} does not exist in the original",
                    file=sys.stderr,
                )
                cluster.demote_from_submitter()
                sys.exit(1)
        logger.info("Updated submitter parameters from %s", submission_groups_file)

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing, successful)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(jobs_to_resubmit, output)
    # Clear previous results so resubmitted jobs start fresh.
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit, updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        # Always release the submitter role.
        cluster.demote_from_submitter()

    sys.exit(ret)