def test_cluster__create(cluster):
    assert cluster.config.num_jobs == 2
    assert cluster.config.submitted_jobs == 0
    assert cluster.config.completed_jobs == 0
    assert not cluster.all_jobs_submitted()
    assert cluster.has_submitter()
    assert cluster.am_i_submitter()
    assert cluster.config.version == 1
    assert cluster.job_status.version == 1

    cluster.demote_from_submitter()
    assert not cluster.am_i_submitter()
    assert cluster.config.version == 2

    cluster, _ = Cluster.deserialize(cluster.config.path)
    assert not cluster.has_submitter()
    assert not cluster.am_i_submitter()
    assert cluster.job_status is None
    assert cluster.config.num_jobs == 2
    assert cluster.config.submitted_jobs == 0
    assert cluster.config.completed_jobs == 0
    assert cluster.config.version == 2

    cluster, promoted = Cluster.deserialize(
        cluster.config.path,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    assert promoted
    assert cluster.am_i_submitter()
    assert cluster.job_status is not None
    assert cluster.config.version == 3
def hostnames(output_dir, job_id, verbose):
    """Show the hostnames of active nodes participating in the batch."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr,
        )
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup([cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    nodes = []
    for _job_id in cluster.job_status.hpc_job_ids:
        if job_id is not None and _job_id != job_id:
            continue
        nodes += hpc_mgr.list_active_nodes(_job_id)

    if not nodes:
        print("No nodes were detected.", file=sys.stderr)
        sys.exit(1)

    print(" ".join(nodes))
def manager_node(output_dir, job_id, verbose):
    """Print the name of the manager node to the console. Requires a single job in the batch."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr,
        )
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup([cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    if job_id not in cluster.job_status.hpc_job_ids:
        print(f"job_id={job_id} is not active", file=sys.stderr)
        sys.exit(1)

    node = hpc_mgr.list_active_nodes(job_id)[0]
    print(node)
def run_submit_jobs(config_file, output, params, pipeline_stage_num=None):
    """Allows submission from an existing Python process."""
    os.makedirs(output, exist_ok=True)

    mgr = JobSubmitter.create(config_file, params, output=output)
    cluster = Cluster.create(
        output,
        mgr.config,
        pipeline_stage_num=pipeline_stage_num,
    )

    local = params.hpc_config.hpc_type == HpcType.LOCAL
    ret = 1
    try:
        status = mgr.submit_jobs(cluster, force_local=local)
        if status == Status.IN_PROGRESS:
            check_cmd = f"jade show-status -o {output}"
            if not params.dry_run:
                print(f"Jobs are in progress. Run '{check_cmd}' for updates.")
            ret = 0
        else:
            ret = status.value
    finally:
        cluster.demote_from_submitter()
        if local:
            # These files were not used in this case.
            cluster.delete_files_internal()

    return ret
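# Hypothetical usage sketch (not part of the module above): calling
# run_submit_jobs from an existing Python process instead of the CLI. The
# SubmitterParams import path and constructor arguments are assumptions and may
# differ across JADE versions; "config.json", "output-dir", and the account are
# placeholders.
from jade.models.submitter_params import SubmitterParams  # assumed path

hpc_config = HpcConfig(hpc_type="slurm", hpc=SlurmConfig(account="abc"))
params = SubmitterParams(hpc_config=hpc_config)  # assumed constructor signature
ret = run_submit_jobs("config.json", "output-dir", params)
assert ret == 0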
def cancel_jobs(output, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")
    logger.info(get_cli_string())

    for _ in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        sys.exit(0)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
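# Hypothetical helper (not part of JADE): the promote-and-retry pattern used in
# cancel_jobs above, factored out for reuse. It relies only on
# Cluster.deserialize with try_promote_to_submitter, as shown in this file;
# the attempts/interval defaults are arbitrary.
def wait_for_promotion(output, attempts=60, interval=1):
    """Return a promoted Cluster, or None if promotion never succeeds."""
    for _ in range(attempts):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if promoted:
            return cluster
        time.sleep(interval)
    return None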
def _handle_submission_groups(self):
    # The JobConfiguration will not have any groups if the user didn't define any.
    # Reload from the cluster config.
    if not self._config.submission_groups:
        submission_groups = Cluster.deserialize_submission_groups(Path(self._output))
        assert len(submission_groups) == 1
        self._config.append_submission_group(submission_groups[0])
def test_cluster__version_mismatch(cluster):
    cluster.demote_from_submitter()
    assert not cluster.am_i_submitter()

    with open(cluster._config_version_file, "w") as f_out:
        f_out.write(str(cluster.config.version + 1) + "\n")
    with open(cluster._job_status_version_file, "w") as f_out:
        f_out.write(str(cluster.job_status.version + 1) + "\n")

    try:
        with pytest.raises(ConfigVersionMismatch):
            cluster.promote_to_submitter()
    finally:
        os.remove(Cluster.get_lock_file(cluster.config.path))

    try:
        with pytest.raises(ConfigVersionMismatch):
            submitted_jobs = cluster.job_status.jobs
            cluster.update_job_status(submitted_jobs, [], set(), [], [1], 1)
    finally:
        os.remove(Cluster.get_lock_file(cluster.config.path))
def show_times(output_dirs, verbose):
    """Show the run times of all allocated jobs."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    job_ids = []
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=False)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr,
            )
            sys.exit(1)
        job_ids += [x.name.split("_")[2].replace(".e", "") for x in path.glob("*.e")]

    job_ids.sort(key=lambda x: int(x))
    groups = make_submission_group_lookup([cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output)
    total_duration = timedelta(seconds=0)
    table = PrettyTable()
    table.field_names = HpcJobStats._fields
    total_aus = 0
    if os.environ.get("NREL_CLUSTER") == "eagle":
        au_parser = get_nrel_eagle_aus
    else:
        au_parser = None

    for job_id in job_ids:
        stats = hpc_mgr.get_job_stats(job_id)
        if stats is None:
            continue
        duration = stats.end - stats.start
        if stats.state == HpcJobStatus.COMPLETE and isinstance(stats.end, datetime):
            total_duration += duration
        data = stats._asdict()
        data["state"] = data["state"].value
        if au_parser is not None:
            total_aus += au_parser(duration, stats.qos)
        table.add_row(data.values())

    print(table)
    print(f"\nTotal duration = {total_duration}")
    print("Total hours = {:.2f}".format(total_duration.total_seconds() / 3600))
    if au_parser is not None:
        print("Total AUs = {:.2f}".format(total_aus))
def wait(output, poll_interval):
    """Wait for a JADE submission to complete."""
    while True:
        try:
            cluster, _ = Cluster.deserialize(output)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr,
            )
            sys.exit(1)
        if cluster.is_complete():
            print("All jobs are complete")
            break
        time.sleep(poll_interval * 60)
def resubmit_jobs(output, failed, missing, verbose):
    """Resubmit failed and missing jobs."""
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print("resubmit-jobs requires that the existing submission be complete", file=sys.stderr)
        sys.exit(1)
    assert promoted

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit, updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        cluster.demote_from_submitter()

    sys.exit(ret)
def cluster():
    os.makedirs(OUTPUT, exist_ok=True)
    commands = ["echo 'hello'"] * 2
    cmd_file = os.path.join(OUTPUT, "commands.txt")
    with open(cmd_file, "w") as f_out:
        for cmd in commands:
            f_out.write(cmd + "\n")

    jade_config = GenericCommandConfiguration.auto_config(cmd_file)
    config_file = os.path.join(OUTPUT, CONFIG_FILE)
    jade_config.dump(config_file)
    hpc_config = HpcConfig(hpc_type="slurm", hpc=SlurmConfig(account="abc"))
    cluster = Cluster.create(OUTPUT, jade_config)
    yield cluster

    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)
def am_i_manager(output_dir, verbose):
    """Print 'true' or 'false' depending on whether the current node is the manager node."""
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    try:
        cluster, _ = Cluster.deserialize(output_dir, deserialize_jobs=True)
    except InvalidConfiguration:
        print(
            f"{output_dir} is not a JADE output directory used in cluster mode",
            file=sys.stderr,
        )
        sys.exit(1)

    if cluster.is_complete():
        print("All jobs are already complete.")
        sys.exit()

    groups = make_submission_group_lookup([cluster.config.submission_groups[0]])
    hpc_mgr = HpcManager(groups, output_dir)
    am_manager = hpc_mgr.am_i_manager()
    print(str(am_manager).lower(), end="")
def list_active_ids(output_dirs, verbose):
    """List the HPC job IDs that are pending or running."""
    # TODO: add flag for only pending or only running
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, None, console_level=level)
    job_ids = []
    for output in output_dirs:
        path = Path(output)
        try:
            cluster, _ = Cluster.deserialize(path, deserialize_jobs=True)
        except InvalidConfiguration:
            print(
                f"{output} is not a JADE output directory used in cluster mode",
                file=sys.stderr,
            )
            sys.exit(1)
        if not cluster.is_complete():
            job_ids += list(cluster.iter_hpc_job_ids())

    job_ids.sort(key=lambda x: int(x))
    print(" ".join(job_ids))
def _complete_hpc_job(self):
    """Complete the HPC job in the cluster.

    A future submitter could detect this. However, it's better to do it here
    for these reasons:

    1. If we don't do this and the user runs 'jade show-status -j' before the
       next submitter runs, a stale HPC job ID will show up.
    2. The last submitter will never detect its job as complete, and so the
       job status file will always include it.

    """
    job_id = self._intf.get_current_job_id()
    if job_id is None:
        # TODO: need to implement persistent recording of fake status for FakeManager
        return

    max_time_s = 30
    interval = 5
    completed = False
    for _ in range(max_time_s // interval):
        cluster, promoted = Cluster.deserialize(
            self._output, try_promote_to_submitter=True, deserialize_jobs=True
        )
        if promoted:
            try:
                cluster.complete_hpc_job_id(job_id)
            finally:
                cluster.demote_from_submitter()
            completed = True
            break
        time.sleep(interval)

    if not completed:
        logger.warning(
            "Could not promote to submitter. HPC job ID %s is still present", job_id
        )
def resubmit_jobs(output, failed, missing, successful, submission_groups_file, verbose):
    """Resubmit jobs."""
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print("resubmit-jobs requires that the existing submission be complete", file=sys.stderr)
        sys.exit(1)
    assert promoted

    if submission_groups_file is not None:
        groups = load_data(submission_groups_file)
        cur = len(groups)
        orig = len(cluster.config.submission_groups)
        if cur != orig:
            print(
                f"Length of submission_groups ({cur}) must be identical to the original ({orig})",
                file=sys.stderr,
            )
            cluster.demote_from_submitter()
            sys.exit(1)
        for _group in groups:
            group = SubmissionGroup(**_group)
            found = False
            for i, orig_group in enumerate(cluster.config.submission_groups):
                if group.name == orig_group.name:
                    cluster.config.submission_groups[i] = group
                    found = True
                    break
            if not found:
                print(
                    f"submission group {group.name} does not exist in the original",
                    file=sys.stderr,
                )
                cluster.demote_from_submitter()
                sys.exit(1)
        logger.info("Updated submitter parameters from %s", submission_groups_file)

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing, successful)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit, updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        cluster.demote_from_submitter()

    sys.exit(ret)
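# Hypothetical shape of the data loaded from submission_groups_file above: a
# list of dicts, one per group to override, each carrying the "name" of an
# existing group. Any additional keys are passed straight to
# SubmissionGroup(**group); the full schema is defined by SubmissionGroup and is
# not shown here, so "default" is only a placeholder group name.
example_submission_groups = [
    {"name": "default"},  # must match a group name in the original submission
]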