def hostnames(output_dir, job_id, verbose):
    """Print the hostnames of the active nodes in the batch, separated by spaces.

    Parameters
    ----------
    output_dir
        Directory that is deserialized into a ``Cluster``.
    job_id
        HPC job ID to restrict the listing to. When None, every HPC job ID
        recorded in the cluster's job status is queried.
    verbose : bool
        Log at DEBUG level on the console instead of INFO.

    Notes
    -----
    Exits with status 1 when ``output_dir`` cannot be deserialized or when no
    nodes are found. Exits with the default status when the cluster reports
    that it is already complete.
    """
    console_level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=console_level)

    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr,
        )
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    # Only the first submission group is used to build the HPC manager.
    first_group = cluster.config.submission_groups[0]
    lookup = make_submission_group_lookup([first_group])
    hpc_mgr = HpcManager(lookup, output_dir)

    active_nodes = []
    for hpc_job_id in cluster.job_status.hpc_job_ids:
        if job_id is None or hpc_job_id == job_id:
            active_nodes.extend(hpc_mgr.list_active_nodes(hpc_job_id))

    if not active_nodes:
        print("No nodes were detected.", file=sys.stderr)
        sys.exit(1)

    print(" ".join(active_nodes))
def manager_node(output_dir, job_id, verbose):
    """Print the name of the manager node to the console.

    Requires a single job in the batch. The manager node is taken to be the
    first entry returned by ``HpcManager.list_active_nodes`` for ``job_id``.

    Parameters
    ----------
    output_dir
        Directory that is deserialized into a ``Cluster``.
    job_id
        HPC job ID; must be one of the cluster's recorded HPC job IDs.
    verbose : bool
        Log at DEBUG level on the console instead of INFO.

    Notes
    -----
    Exits with status 1 when ``output_dir`` cannot be deserialized, when
    ``job_id`` is not one of the cluster's HPC job IDs, or when the job has no
    active nodes. Exits with the default status when the cluster is already
    complete.
    """
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)

    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr,
        )
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup([cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    if job_id not in cluster.job_status.hpc_job_ids:
        print(f"job_id={job_id} is not active", file=sys.stderr)
        sys.exit(1)

    nodes = hpc_mgr.list_active_nodes(job_id)
    if not nodes:
        # A known job ID can still have no active nodes; report that instead
        # of failing with an IndexError on the subscript below.
        print(f"No nodes were detected for job_id={job_id}.", file=sys.stderr)
        sys.exit(1)

    print(nodes[0])
def cancel_jobs(self, cluster):
    """Cancel every HPC job recorded for the cluster and mark it canceled.

    Parameters
    ----------
    cluster
        Provides the submission groups used to build the HPC manager, the
        HPC job IDs to cancel, and ``mark_canceled()``.
    """
    lookup = make_submission_group_lookup(cluster.config.submission_groups)
    hpc_mgr = HpcManager(lookup, self._output)

    for hpc_job_id in cluster.job_status.hpc_job_ids:
        hpc_mgr.cancel_job(hpc_job_id)

    cluster.mark_canceled()
def show_times(output_dirs, verbose):
    """Show the run times of all allocated jobs.

    HPC job IDs are parsed from the names of the ``*.e`` files in each output
    directory (third ``_``-separated field). Stats for each job are fetched
    from the HPC manager and printed as a table, followed by totals.

    Parameters
    ----------
    output_dirs
        Iterable of output directories. Each must deserialize into a
        ``Cluster``.
    verbose : bool
        Log at DEBUG level on the console instead of INFO.

    Notes
    -----
    Exits with status 1 when no directories are given or when a directory
    cannot be deserialized. The HPC manager is built from the first
    submission group of the last directory processed.
    """
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)

    job_ids = []
    cluster = None
    output = None
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=False)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr,
            )
            sys.exit(1)

        job_ids += [x.name.split("_")[2].replace(".e", "") for x in path.glob("*.e")]

    if cluster is None:
        # No directory was processed; without this guard the code below would
        # fail with a NameError on the unbound cluster/output names.
        print("No output directories were provided.", file=sys.stderr)
        sys.exit(1)

    job_ids.sort(key=int)
    groups = make_submission_group_lookup([cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output)

    total_duration = timedelta(seconds=0)
    table = PrettyTable()
    table.field_names = HpcJobStats._fields
    total_aus = 0
    # AU accounting is only available when NREL_CLUSTER is set to "eagle".
    if os.environ.get("NREL_CLUSTER") == "eagle":
        au_parser = get_nrel_eagle_aus
    else:
        au_parser = None

    for job_id in job_ids:
        stats = hpc_mgr.get_job_stats(job_id)
        if stats is None:
            continue
        # NOTE(review): the subtraction runs before the isinstance check
        # below; confirm stats.end always supports it for unfinished jobs.
        duration = stats.end - stats.start
        if stats.state == HpcJobStatus.COMPLETE and isinstance(stats.end, datetime):
            total_duration += duration
        data = stats._asdict()
        data["state"] = data["state"].value
        if au_parser is not None:
            total_aus += au_parser(duration, stats.qos)
        table.add_row(data.values())

    print(table)
    print(f"\nTotal duration = {total_duration}")
    print("Total hours = {:.2f}".format(total_duration.total_seconds() / 3600))
    if au_parser is not None:
        print("Total AUs = {:.2f}".format(total_aus))
def am_i_manager(output_dir, verbose):
    """Print 'true' or 'false' depending on whether this node is the manager.

    The answer comes from ``HpcManager.am_i_manager()`` and is written in
    lowercase without a trailing newline.

    Parameters
    ----------
    output_dir
        Directory that is deserialized into a ``Cluster``.
    verbose : bool
        Log at DEBUG level on the console instead of INFO.

    Notes
    -----
    Exits with status 1 when ``output_dir`` cannot be deserialized, and with
    the default status when the cluster is already complete.
    """
    console_level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=console_level)

    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr,
        )
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    # Only the first submission group is used to build the HPC manager.
    first_group = cluster.config.submission_groups[0]
    lookup = make_submission_group_lookup([first_group])
    hpc_mgr = HpcManager(lookup, output_dir)

    answer = str(hpc_mgr.am_i_manager()).lower()
    print(answer, end="")
def _make_async_submitter(self, jobs, num_processes, output, verbose):
    """Create a per-batch config file and run script and return a submitter.

    A shallow copy of the base config is written with ``jobs`` as its
    ``"jobs"`` entry to a new file named after the original config file with
    ``_batch_<index>`` inserted before the extension. The batch index is
    incremented on every call.

    Parameters
    ----------
    jobs
        Jobs to place in the split config file.
    num_processes
        Forwarded to ``_create_run_script``.
    output
        Directory for the run script; also passed to the HPC manager and
        the returned submitter.
    verbose
        Forwarded to ``_create_run_script``.

    Returns
    -------
    AsyncHpcSubmitter
    """
    config = copy.copy(self._base_config)
    config["jobs"] = jobs
    suffix = f"_batch_{self._batch_index}"
    self._batch_index += 1

    # Insert the suffix before the extension only. str.replace(".json", ...)
    # would rewrite every ".json" in the path (directory names included) and
    # would leave other extensions untouched, overwriting the original file.
    root, ext = os.path.splitext(self._config_file)
    new_config_file = f"{root}{suffix}{ext}"
    dump_data(config, new_config_file, cls=ExtendedJSONEncoder)
    logger.info(
        "Created split config file %s with %s jobs",
        new_config_file,
        len(config["jobs"]),
    )

    run_script = os.path.join(output, f"run{suffix}.sh")
    self._create_run_script(new_config_file, run_script, num_processes, output, verbose)

    hpc_mgr = HpcManager(self._hpc_config_file, output)
    name = self._name + suffix
    return AsyncHpcSubmitter(hpc_mgr, run_script, name, output)
def create_hpc_manager(cluster, config):
    """Build an HpcManager from ``config`` and check its HPC type.

    Sets ``NREL_CLUSTER`` in the process environment to ``cluster``, writes
    ``config`` to a temporary file in the current directory, builds the
    manager from that file, and removes the file afterwards.

    Parameters
    ----------
    cluster : str
        ``"eagle"`` (expects SLURM) or ``"peregrine"`` (expects PBS).
    config
        HPC configuration data passed to ``dump_data``.

    Returns
    -------
    HpcManager

    Raises
    ------
    AssertionError
        If the manager's HPC type does not match the cluster, or the cluster
        name is not recognized.
    """
    os.environ["NREL_CLUSTER"] = cluster
    hpc_file = "test-hpc-config.toml"
    try:
        dump_data(config, hpc_file)
        mgr = HpcManager(hpc_file, OUTPUT_DIR)
    finally:
        # The file may not exist if dump_data failed; an unconditional remove
        # would raise FileNotFoundError and hide the real error.
        if os.path.exists(hpc_file):
            os.remove(hpc_file)

    if cluster == "eagle":
        assert mgr.hpc_type == HpcType.SLURM
    elif cluster == "peregrine":
        assert mgr.hpc_type == HpcType.PBS
    else:
        # Raised explicitly so the check still runs under `python -O`.
        raise AssertionError("unknown cluster={}".format(cluster))

    return mgr
def __init__(self, config: JobConfiguration, config_file, cluster: Cluster, output):
    """Set up submitter state from the job configuration and the cluster.

    Parameters
    ----------
    config : JobConfiguration
        Job configuration; its serialized form is kept as the base config.
    config_file
        Path of the configuration file.
    cluster : Cluster
        Supplies the submission groups and the starting batch index.
    output
        Output directory, also passed to the HPC manager.
    """
    self._config = config
    self._submission_groups = make_submission_group_lookup(
        cluster.config.submission_groups
    )
    self._config_file = config_file
    self._base_config = config.serialize()
    self._batch_index = cluster.job_status.batch_index
    self._cluster = cluster
    self._hpc_mgr = HpcManager(self._submission_groups, output)
    self._output = output

    # Limitation: these settings apply to all groups in aggregate.
    # This could be made more flexible if needed.
    first_group = next(iter(self._submission_groups.values()))
    params = first_group.submitter_params

    # No configured limit means the node count is effectively unbounded.
    max_nodes = params.max_nodes
    self._max_nodes = sys.maxsize if max_nodes is None else max_nodes

    self._poll_interval = params.poll_interval
    self._status_collector = HpcStatusCollector(self._hpc_mgr, self._poll_interval)
def __init__(self, config_file, output, batch_id=0):
    """Construct a JobRunner for one batch.

    Parameters
    ----------
    config_file
        Passed to the base-class initializer.
    output
        Output directory; the events log file is placed here.
    batch_id : int
        Batch identifier, used in the events log filename.
    """
    super(JobRunner, self).__init__(config_file, output)
    self._handle_submission_groups()

    default_group = self.config.get_default_submission_group()
    hpc_config = default_group.submitter_params.hpc_config

    self._intf = HpcManager.create_hpc_interface(hpc_config)
    self._node_id = self._intf.get_node_id()
    self._intf_type = hpc_config.hpc_type
    self._batch_id = batch_id

    # The filename combines batch ID and node ID.
    events_name = f"run_jobs_batch_{batch_id}_{self._node_id}_events.log"
    self._event_filename = os.path.join(output, events_name)
    self._event_logger = None

    logger.debug("Constructed JobRunner output=%s batch=%s", output, batch_id)
def submit_jobs(self, cluster, force_local=False):
    """Submit simulations.

    Runs the jobs in this process through a ``JobRunner`` when the HPC
    manager reports a LOCAL type or ``force_local`` is set; otherwise
    submits them to the HPC.

    Parameters
    ----------
    cluster : Cluster
    force_local : bool
        If on HPC, run jobs through subprocess as if local.

    Returns
    -------
    Status
        ``Status.IN_PROGRESS`` while the HPC submission is not complete,
        otherwise the value returned by ``_handle_completion``.

    """
    if self._is_new:
        logger.info("Submit %s jobs for execution.", self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        logger_names = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(logger_names))
        self._save_repository_info(registry)

        ResultsAggregator.create(self._output)

        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)

        log_event(
            StructuredLogEvent(
                source="submitter",
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_SUBMIT_COMPLETED,
                message="job submission started",
                num_jobs=self.get_num_jobs(),
            )
        )

        os.environ["JADE_RUNTIME_OUTPUT"] = self._output
        setup_command = self._config.setup_command
        if setup_command is not None:
            # The prefix appears only in the log line; the variable itself
            # is set in os.environ just above.
            cmd = f"JADE_RUNTIME_OUTPUT={self._output} {setup_command}"
            logger.info("Running setup command: %s", cmd)
            check_run_command(setup_command)
    else:
        self._handle_submission_groups()

    result = Status.IN_PROGRESS
    group = self._config.get_default_submission_group()
    groups = make_submission_group_lookup(cluster.config.submission_groups)
    self._hpc = HpcManager(groups, self._output)

    run_locally = self._hpc.hpc_type == HpcType.LOCAL or force_local
    if run_locally:
        runner = JobRunner(self._config_file, output=self._output)
        params = group.submitter_params
        num_processes = params.num_processes
        verbose = params.verbose
        result = runner.run_jobs(verbose=verbose, num_processes=num_processes)
        ResultsAggregator.load(self._output).process_results()
        is_complete = True
    else:
        is_complete = self._submit_to_hpc(cluster)

    if is_complete:
        result = self._handle_completion(cluster)

    return result
def test_create_slurm_invalid_file(hpc_fixture):
    """HpcManager raises FileNotFoundError for a config file that does not exist."""
    os.environ["NREL_CLUSTER"] = "eagle"
    missing_config = "invalid_filename"
    with pytest.raises(FileNotFoundError):
        HpcManager(missing_config, OUTPUT_DIR)
def submit_jobs(
    self,
    name="job",
    per_node_batch_size=DEFAULTS["per_node_batch_size"],
    max_nodes=DEFAULTS["max_nodes"],
    force_local=False,
    verbose=False,
    poll_interval=DEFAULTS["poll_interval"],
    num_processes=None,
    previous_results=None,
    reports=True,
    try_add_blocked_jobs=False,
):
    """Submit simulations.

    Auto-detect whether the current system is an HPC and submit to its queue.
    Otherwise, run locally. After the jobs finish, the results are collected
    and written, the temporary results directory is removed, and resource
    events are logged.

    Parameters
    ----------
    name : str
        batch name, applies to HPC job submission only
    per_node_batch_size : int
        Number of jobs to run on one node in one batch.
    max_nodes : int
        Max number of node submission requests to make in parallel.
    force_local : bool
        If on HPC, run jobs through subprocess as if local.
    verbose : bool
        Enable debug logging.
    poll_interval : int
        Interval in seconds on which to poll jobs.
    num_processes : int
        Number of processes to run in parallel; defaults to num CPUs
    previous_results : list | None
        If not empty, appended to this run's results before they are written.
    reports : bool
        Generate reports in the output directory when true.
    try_add_blocked_jobs : bool
        Forwarded to the HPC submission path; not used when running locally.

    Returns
    -------
    Status
        ``Status.ERROR`` if the number of results does not match the number
        of jobs; otherwise the local runner's status or ``Status.GOOD``.

    """
    logger.info("Submit %s jobs for execution.", self._config.get_num_jobs())
    logger.info("JADE version %s", jade.version.__version__)
    registry = Registry()
    logger_names = registry.list_loggers()
    logger.info("Registered modules for logging: %s", ", ".join(logger_names))
    self._save_repository_info(registry)

    self._config.check_job_dependencies()

    self._hpc = HpcManager(self._hpc_config_file, self._output)
    result = Status.GOOD

    # If an events summary file exists, it is invalid.
    events_file = os.path.join(self._output, EVENTS_FILENAME)
    if os.path.exists(events_file):
        os.remove(events_file)

    start_time = time.time()
    run_locally = self._hpc.hpc_type == HpcType.LOCAL or force_local
    if run_locally:
        runner = JobRunner(self._config_file, output=self._output)
        result = runner.run_jobs(verbose=verbose, num_processes=num_processes)
    else:
        self._submit_to_hpc(
            name,
            max_nodes,
            per_node_batch_size,
            verbose,
            poll_interval,
            num_processes,
            try_add_blocked_jobs,
        )

    results_summary = ResultsAggregatorSummary(self._results_dir)
    self._results = results_summary.get_results()
    if len(self._results) != self._config.get_num_jobs():
        logger.error(
            "Number of results doesn't match number of jobs: "
            "results=%s jobs=%s. Check for process crashes "
            "or HPC timeouts.",
            len(self._results),
            self._config.get_num_jobs(),
        )
        result = Status.ERROR

    if previous_results:
        self._results += previous_results

    self.write_results(RESULTS_FILE)
    results_summary.delete_files()
    shutil.rmtree(self._results_dir)

    self._log_error_log_messages(self._output)

    bytes_consumed = get_directory_size_bytes(self._output, recursive=False)
    log_event(
        StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=bytes_consumed,
        )
    )

    log_event(
        StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_CONFIG_EXEC_SUMMARY,
            message="config execution summary",
            config_execution_time=time.time() - start_time,
            num_jobs=self.get_num_jobs(),
        )
    )

    if reports:
        self.generate_reports(self._output)

    return result