Example #1
0
File: cluster.py Project: NREL/jade
def hostnames(output_dir, job_id, verbose):
    """Print the hostnames of active nodes in the batch, separated by spaces.

    If job_id is not None, only the nodes of that HPC job are listed.
    Exits with status 1 when output_dir cannot be deserialized as a cluster
    or when no nodes are found; exits early when all jobs are complete.
    """
    setup_logging(
        __name__,
        None,
        console_level=logging.DEBUG if verbose else logging.INFO,
    )
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        message = f"{output_dir} is not a JADE output directory used in cluster mode"
        print(message, file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    # Only the first submission group is used to build the HPC manager.
    first_group = cluster.config.submission_groups[0]
    manager = HpcManager(make_submission_group_lookup([first_group]), output_dir)

    active = []
    for hpc_job_id in cluster.job_status.hpc_job_ids:
        if job_id is None or hpc_job_id == job_id:
            active.extend(manager.list_active_nodes(hpc_job_id))

    if active:
        print(" ".join(active))
    else:
        print("No nodes were detected.", file=sys.stderr)
        sys.exit(1)
Example #2
0
File: cluster.py Project: NREL/jade
def manager_node(output_dir, job_id, verbose):
    """Print the name of the manager node to the console. Requires a single job in the batch.

    The first entry returned by ``list_active_nodes`` for ``job_id`` is
    printed as the manager node.

    Exits with status 1 when ``output_dir`` cannot be deserialized as a
    cluster, when ``job_id`` is not one of the cluster's HPC job IDs, or when
    no active nodes are reported for the job. Exits early when all jobs are
    already complete.
    """
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    if job_id not in cluster.job_status.hpc_job_ids:
        print(f"job_id={job_id} is not active", file=sys.stderr)
        sys.exit(1)

    nodes = hpc_mgr.list_active_nodes(job_id)
    if not nodes:
        # Report the problem the same way hostnames() does instead of
        # failing with an IndexError on the empty result.
        print("No nodes were detected.", file=sys.stderr)
        sys.exit(1)

    print(nodes[0])
Example #3
0
    def cancel_jobs(self, cluster):
        """Cancel every HPC job recorded for the cluster, then mark it canceled."""
        lookup = make_submission_group_lookup(cluster.config.submission_groups)
        manager = HpcManager(lookup, self._output)
        hpc_job_ids = cluster.job_status.hpc_job_ids
        for hpc_job_id in hpc_job_ids:
            manager.cancel_job(hpc_job_id)

        cluster.mark_canceled()
Example #4
0
def show_times(output_dirs, verbose):
    """Show the run times of all allocated jobs.

    Job IDs are taken from the third underscore-separated field of every
    ``*.e`` file name found in each output directory. Statistics for each job
    are printed in a table, followed by the total duration of the jobs that
    completed. When the NREL_CLUSTER environment variable is "eagle", the
    total AUs are printed as well.

    Parameters
    ----------
    output_dirs : iterable of str
        Output directories to inspect. The HPC manager is built from the
        last directory and the first submission group of its cluster.
    verbose : bool
        Enable debug logging on the console.

    """
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)

    job_ids = []
    cluster = None
    output = None
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=False)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr)
            sys.exit(1)

        job_ids += [
            x.name.split("_")[2].replace(".e", "") for x in path.glob("*.e")
        ]

    if cluster is None:
        # Nothing was iterated; there is no cluster to build an HPC manager from.
        print("No output directories were specified.", file=sys.stderr)
        sys.exit(1)

    job_ids.sort(key=int)
    groups = make_submission_group_lookup(
        [cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output)

    total_duration = timedelta(seconds=0)
    table = PrettyTable()
    table.field_names = HpcJobStats._fields

    total_aus = 0
    if os.environ.get("NREL_CLUSTER") == "eagle":
        au_parser = get_nrel_eagle_aus
    else:
        au_parser = None

    for job_id in job_ids:
        stats = hpc_mgr.get_job_stats(job_id)
        if stats is None:
            continue

        # A job may not have a datetime end time; only compute a duration
        # when both timestamps can actually be subtracted.
        if isinstance(stats.start, datetime) and isinstance(
                stats.end, datetime):
            duration = stats.end - stats.start
        else:
            duration = None

        if duration is not None and stats.state == HpcJobStatus.COMPLETE:
            total_duration += duration
        data = stats._asdict()
        data["state"] = data["state"].value
        if au_parser is not None and duration is not None:
            total_aus += au_parser(duration, stats.qos)
        table.add_row(data.values())

    print(table)
    print(f"\nTotal duration = {total_duration}")
    print("Total hours = {:.2f}".format(total_duration.total_seconds() / 3600))
    if au_parser is not None:
        print("Total AUs = {:.2f}".format(total_aus))
Example #5
0
File: cluster.py Project: NREL/jade
def am_i_manager(output_dir, verbose):
    """Write 'true' or 'false' (no trailing newline) for whether this node is the manager.

    Exits with status 1 when output_dir cannot be deserialized as a cluster,
    and exits early when all jobs are already complete.
    """
    setup_logging(
        __name__,
        None,
        console_level=logging.DEBUG if verbose else logging.INFO,
    )
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        message = f"{output_dir} is not a JADE output directory used in cluster mode"
        print(message, file=sys.stderr)
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    # Only the first submission group is used to build the HPC manager.
    first_group = cluster.config.submission_groups[0]
    manager = HpcManager(make_submission_group_lookup([first_group]), output_dir)
    answer = str(manager.am_i_manager()).lower()
    print(answer, end="")
Example #6
0
    def _make_async_submitter(self, jobs, num_processes, output, verbose):
        """Write a config file and run script for one batch and return its submitter.

        The batch index is embedded in the generated file names and in the
        submitter name, and is incremented on every call.
        """
        batch_config = copy.copy(self._base_config)
        batch_config["jobs"] = jobs

        batch_suffix = f"_batch_{self._batch_index}"
        self._batch_index += 1

        batch_config_file = self._config_file.replace(
            ".json", f"{batch_suffix}.json")
        dump_data(batch_config, batch_config_file, cls=ExtendedJSONEncoder)
        logger.info(
            "Created split config file %s with %s jobs",
            batch_config_file,
            len(batch_config["jobs"]),
        )

        script_path = os.path.join(output, f"run{batch_suffix}.sh")
        self._create_run_script(
            batch_config_file, script_path, num_processes, output, verbose)

        manager = HpcManager(self._hpc_config_file, output)
        return AsyncHpcSubmitter(
            manager, script_path, self._name + batch_suffix, output)
Example #7
0
def create_hpc_manager(cluster, config):
    """Build an HpcManager from a temporary config file and check its HPC type.

    Sets the NREL_CLUSTER environment variable to ``cluster``, writes
    ``config`` to a temporary file that is removed afterwards, and asserts
    that "eagle" yields SLURM and "peregrine" yields PBS.
    """
    os.environ["NREL_CLUSTER"] = cluster
    hpc_file = "test-hpc-config.toml"
    mgr = None
    try:
        dump_data(config, hpc_file)
        mgr = HpcManager(hpc_file, OUTPUT_DIR)
    finally:
        os.remove(hpc_file)

    if cluster not in ("eagle", "peregrine"):
        assert False, "unknown cluster={}".format(cluster)

    expected_type = HpcType.SLURM if cluster == "eagle" else HpcType.PBS
    assert mgr.hpc_type == expected_type
    return mgr
Example #8
0
    def __init__(self, config: JobConfiguration, config_file, cluster: Cluster,
                 output):
        """Store the configuration and set up HPC management for the cluster.

        Builds the submission group lookup, the HPC manager, and the status
        collector, and takes the node limit and poll interval from the first
        submission group.
        """
        self._config = config
        self._config_file = config_file
        self._cluster = cluster
        self._output = output

        self._submission_groups = make_submission_group_lookup(
            cluster.config.submission_groups)
        self._base_config = config.serialize()
        self._batch_index = cluster.job_status.batch_index
        self._hpc_mgr = HpcManager(self._submission_groups, output)

        # Limitation: these settings apply to all groups in aggregate.
        # This could be made more flexible if needed.
        first_group = next(iter(self._submission_groups.values()))
        params = first_group.submitter_params
        max_nodes = params.max_nodes
        # No configured limit means effectively unlimited.
        self._max_nodes = sys.maxsize if max_nodes is None else max_nodes
        self._poll_interval = params.poll_interval
        self._status_collector = HpcStatusCollector(
            self._hpc_mgr, self._poll_interval)
Example #9
0
    def __init__(
        self,
        config_file,
        output,
        batch_id=0,
    ):
        """Create the HPC interface for the default submission group.

        Records the node ID reported by the interface and derives the name of
        the per-batch, per-node events log inside ``output``.
        """
        super(JobRunner, self).__init__(config_file, output)
        self._handle_submission_groups()

        default_group = self.config.get_default_submission_group()
        hpc_config = default_group.submitter_params.hpc_config
        self._intf = HpcManager.create_hpc_interface(hpc_config)
        self._node_id = self._intf.get_node_id()
        self._intf_type = hpc_config.hpc_type
        self._batch_id = batch_id

        events_name = f"run_jobs_batch_{batch_id}_{self._node_id}_events.log"
        self._event_filename = os.path.join(output, events_name)
        self._event_logger = None

        logger.debug(
            "Constructed JobRunner output=%s batch=%s", output, batch_id)
Example #10
0
    def submit_jobs(self, cluster, force_local=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        On a new submission the results aggregator is created, any existing
        events summary file is removed, and the optional setup command is run.

        Parameters
        ----------
        cluster : Cluster
        force_local : bool
            If on HPC, run jobs through subprocess as if local.

        Returns
        -------
        Status

        """
        if not self._is_new:
            self._handle_submission_groups()
        else:
            logger.info("Submit %s jobs for execution.",
                        self._config.get_num_jobs())
            logger.info("JADE version %s", jade.version.__version__)
            registry = Registry()
            logger_names = registry.list_loggers()
            logger.info("Registered modules for logging: %s",
                        ", ".join(logger_names))
            self._save_repository_info(registry)

            ResultsAggregator.create(self._output)

            # If an events summary file exists, it is invalid.
            stale_events = os.path.join(self._output, EVENTS_FILENAME)
            if os.path.exists(stale_events):
                os.remove(stale_events)

            # NOTE(review): the event name says "completed" while the message
            # says "started" - confirm which one is intended.
            log_event(
                StructuredLogEvent(
                    source="submitter",
                    category=EVENT_CATEGORY_RESOURCE_UTIL,
                    name=EVENT_NAME_SUBMIT_COMPLETED,
                    message="job submission started",
                    num_jobs=self.get_num_jobs(),
                ))

            # The setup command sees the output directory via the environment;
            # the prefixed form below is only used for the log message.
            os.environ["JADE_RUNTIME_OUTPUT"] = self._output
            setup_command = self._config.setup_command
            if setup_command is not None:
                shown = f"JADE_RUNTIME_OUTPUT={self._output} {setup_command}"
                logger.info("Running setup command: %s", shown)
                check_run_command(setup_command)

        result = Status.IN_PROGRESS
        default_group = self._config.get_default_submission_group()
        group_lookup = make_submission_group_lookup(
            cluster.config.submission_groups)
        self._hpc = HpcManager(group_lookup, self._output)

        run_locally = self._hpc.hpc_type == HpcType.LOCAL or force_local
        if not run_locally:
            is_complete = self._submit_to_hpc(cluster)
        else:
            local_runner = JobRunner(self._config_file, output=self._output)
            params = default_group.submitter_params
            result = local_runner.run_jobs(
                verbose=params.verbose,
                num_processes=params.num_processes,
            )
            ResultsAggregator.load(self._output).process_results()
            is_complete = True

        if not is_complete:
            return result
        return self._handle_completion(cluster)
Example #11
0
def test_create_slurm_invalid_file(hpc_fixture):
    """Expect FileNotFoundError when HpcManager is given the path 'invalid_filename'."""
    os.environ["NREL_CLUSTER"] = "eagle"
    bad_config_path = "invalid_filename"
    with pytest.raises(FileNotFoundError):
        HpcManager(bad_config_path, OUTPUT_DIR)
Example #12
0
    def submit_jobs(self,
                    name="job",
                    per_node_batch_size=DEFAULTS["per_node_batch_size"],
                    max_nodes=DEFAULTS["max_nodes"],
                    force_local=False,
                    verbose=False,
                    poll_interval=DEFAULTS["poll_interval"],
                    num_processes=None,
                    previous_results=None,
                    reports=True,
                    try_add_blocked_jobs=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        After the jobs have run, the results are collected and written to the
        results file, the intermediate results directory is removed, resource
        utilization events are logged, and reports are optionally generated.

        Parameters
        ----------
        name : str
            batch name, applies to HPC job submission only
        per_node_batch_size : int
            Number of jobs to run on one node in one batch.
        max_nodes : int
            Max number of node submission requests to make in parallel.
        force_local : bool
            If on HPC, run jobs through subprocess as if local.
        verbose : bool
            Enable debug logging.
        poll_interval : int
            Interval in seconds on which to poll jobs.
        num_processes : int
            Number of processes to run in parallel; defaults to num CPUs
        previous_results : optional
            Results that are appended to this run's results before they are
            written; ignored when empty or None.
        reports : bool
            If True, generate reports in the output directory at the end.
        try_add_blocked_jobs : bool
            Passed through to the HPC submission step; not used when running
            locally.

        Returns
        -------
        Status
            Status.ERROR if the number of results does not match the number
            of jobs. Otherwise the status returned by the local runner, or
            Status.GOOD when the jobs were submitted to the HPC.

        """
        logger.info("Submit %s jobs for execution.",
                    self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        loggers = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(loggers))
        self._save_repository_info(registry)

        self._config.check_job_dependencies()

        self._hpc = HpcManager(self._hpc_config_file, self._output)
        result = Status.GOOD

        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)

        # Start the clock here so the execution-time event below covers the
        # job run and the results post-processing.
        start_time = time.time()
        if self._hpc.hpc_type == HpcType.LOCAL or force_local:
            runner = JobRunner(self._config_file, output=self._output)
            result = runner.run_jobs(verbose=verbose,
                                     num_processes=num_processes)
        else:
            self._submit_to_hpc(name, max_nodes, per_node_batch_size, verbose,
                                poll_interval, num_processes,
                                try_add_blocked_jobs)

        results_summary = ResultsAggregatorSummary(self._results_dir)
        self._results = results_summary.get_results()
        # A count mismatch overrides whatever status the run itself reported.
        if len(self._results) != self._config.get_num_jobs():
            logger.error(
                "Number of results doesn't match number of jobs: "
                "results=%s jobs=%s. Check for process crashes "
                "or HPC timeouts.", len(self._results),
                self._config.get_num_jobs())
            result = Status.ERROR

        # Previous results are added after the count check, so they do not
        # affect the comparison above.
        if previous_results:
            self._results += previous_results

        self.write_results(RESULTS_FILE)
        results_summary.delete_files()
        shutil.rmtree(self._results_dir)

        self._log_error_log_messages(self._output)

        bytes_consumed = get_directory_size_bytes(self._output,
                                                  recursive=False)
        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_CONFIG_EXEC_SUMMARY,
            message="config execution summary",
            config_execution_time=time.time() - start_time,
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        if reports:
            self.generate_reports(self._output)

        return result