def _get_manager_node_name(output_dir):
    output = {}
    job_id = os.environ["SLURM_JOB_ID"]  # TODO: needs to be agnostic to HPC type
    check_run_command(f"jade cluster manager-node {output_dir} {job_id}", output)
    return output["stdout"].strip()

def _run_manager(job_name, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir, f"run_multi_node_job_manager__{job_name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run manager on %s: %s", socket.gethostname(), get_cli_string())

    # Note that the manager receives its own hostname.
    output = {}
    check_run_command(f"jade cluster hostnames {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Manager found %s hostnames: %s", len(hostnames), hostnames)

    cmd = " ".join(manager_script_and_args)
    logger.info("Run manager script [%s]", cmd)
    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    start = time.time()
    ret = run_command(cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)
    return ret

def test_submission_groups_per_node_setup(cleanup):
    config = create_config()
    config.submission_groups[1].submitter_params.node_setup_script = "node.sh"
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    check_run_command(cmd)

    config = create_config_from_file(Path(OUTPUT) / "config_batch_2.json")
    assert config.get_default_submission_group().submitter_params.node_setup_script == "node.sh"

def _set_hostnames(output_dir):
    output = {}
    job_id = os.environ["SLURM_JOB_ID"]  # TODO: needs to be agnostic to HPC type
    check_run_command(f"jade cluster hostnames -j {job_id} {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Found %s hostnames: %s", len(hostnames), hostnames)
    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    return hostnames

def _get_tmpfs_size_gb():
    output = {}
    check_run_command("df -h", output=output)
    # Output looks like this:
    # Filesystem      Size  Used Avail Use% Mounted on
    # tmpfs           378G  4.0K  378G   1% /dev/shm
    for line in output["stdout"].splitlines():
        if line.endswith(TMPFS_MOUNT):
            return _parse_tmpfs_size_str(line)
    raise Exception(f"Did not find {TMPFS_MOUNT} in 'df -h' output: {output['stdout']}")

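# A minimal sketch of what the _parse_tmpfs_size_str helper referenced above (and defined
# elsewhere) might look like, assuming the human-readable "df -h" size column carries
# suffixes such as "378G" or "1.5T". The column index and suffix handling are illustrative
# assumptions, not the project's actual implementation.
def _parse_tmpfs_size_str(line):
    size_str = line.split()[1]  # e.g. "378G" from the Size column
    multipliers = {"M": 1 / 1024, "G": 1, "T": 1024}
    suffix = size_str[-1]
    if suffix not in multipliers:
        raise Exception(f"Unsupported size string: {size_str}")
    return int(float(size_str[:-1]) * multipliers[suffix])
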
def test_submission_groups_per_node_setup(cleanup):
    # TODO: this test is no longer in the right place. Belongs in a file testing job_config.
    config = create_config()
    config.node_setup_command = "node_setup.sh"
    config.node_teardown_command = "node_teardown.sh"
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    check_run_command(cmd)

    config = create_config_from_file(Path(OUTPUT) / "config_batch_2.json")
    assert config.node_setup_command == "node_setup.sh"
    assert config.node_teardown_command == "node_teardown.sh"

def test_job_order(generic_command_fixture):
    num_jobs = 50
    commands = ["echo hello world"] * num_jobs
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    job = config.get_job("1")
    for i in range(10, 15):
        job.blocked_by.add(str(i))
    config.get_job("2").blocked_by.add("1")
    config.get_job("21").blocked_by.add("30")
    config.get_job("41").blocked_by.add("50")
    config.dump(CONFIG_FILE)

    cmd = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
        "--per-node-batch-size=10 "
        "--max-nodes=4 "
        "--poll-interval=0.1 "
        f"--hpc-config {FAKE_HPC_CONFIG} "
        "--num-processes=10"
    )
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    result_summary = ResultsSummary(OUTPUT)
    results = result_summary.list_results()
    assert len(results) == num_jobs
    tracker = {x.name: x for x in results}
    for i in range(10, 15):
        assert tracker["1"].completion_time > tracker[str(i)].completion_time
    assert tracker["2"].completion_time > tracker["1"].completion_time
    assert tracker["21"].completion_time > tracker["30"].completion_time
    assert tracker["41"].completion_time > tracker["50"].completion_time

    # Verify that stats are summarized correctly with aggregation mode.
    stats_text = Path(OUTPUT) / "stats.txt"
    assert stats_text.exists()
    assert "Average" in stats_text.read_text()
    stats_json = Path(OUTPUT) / "stats_summary.json"
    assert stats_json.exists()
    stats = load_data(stats_json)
    assert stats
    assert "batch" in stats[0]

def run_worker(job, manager_node, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    logger.error("in worker manager_node=%s job=%s", manager_node, job.name)
    hostname = socket.gethostname()
    filename = os.path.join(output_dir, f"run_spark_job_worker__{hostname}__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run worker: %s", get_cli_string())

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())

    # Give the master a head start.
    time.sleep(10)

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    logs_dir = job_output / "spark" / "logs"
    job_conf_dir = job_output / "spark" / "conf"
    workers_dir = job_output / "spark" / "workers"
    _set_env_variables(job, job_conf_dir, logs_dir)
    worker_memory = _get_worker_memory_str(job, is_master=False)
    cmd = _get_worker_command(job, manager_node, worker_memory)
    ret = 1
    output = {}
    for _ in range(5):
        output.clear()
        logger.info("Run spark worker: [%s]", cmd)
        ret = run_command(cmd, output=output)
        if ret == 0:
            break
    if ret != 0:
        logger.error("Failed to start spark worker: %s: %s", ret, output)

    shutdown_file = _get_shutdown_file(job.name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    check_run_command(job.model.spark_config.get_stop_worker())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]), workers_dir / hostname)
    return 0

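# The shutdown handshake above polls a sentinel file produced by a _get_shutdown_file
# helper defined elsewhere. A minimal sketch of how such a helper could map a job name to
# a file in the output directory; the file-name pattern is an illustrative assumption,
# not the project's actual convention, and it relies on the module-level Path import used
# throughout these snippets.
def _get_shutdown_file(job_name, output_dir):
    return Path(output_dir) / f"shutdown__{job_name}"
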
def run_multi_node_job(job_name, jade_runtime_output, verbose, manager_script_and_args):
    """Run a job across multiple nodes. The manager node will invoke manager_script_and_args."""
    output = {}
    check_run_command(f"jade cluster am-i-manager {jade_runtime_output}", output)
    result = output["stdout"].strip()
    if result == "true":
        ret = run_manager(job_name, jade_runtime_output, verbose, manager_script_and_args)
    else:
        assert result == "false", result
        # The only purpose of this worker function is to keep the node allocation
        # alive. There are more efficient ways of doing this with HPC commands.
        # However, this procedure allows us to run the JADE JobRunner in the
        # background on each node and collect resource utilization statistics.
        ret = run_worker(job_name, jade_runtime_output, verbose)
    return ret

def test_submission_groups(cleanup):
    config = create_config()
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} -p 0.1"
    check_run_command(cmd)

    output_path = Path(OUTPUT)
    config_batch_files = list(output_path.glob("config_batch*.json"))
    assert len(config_batch_files) == 3
    batch1 = load_data(output_path / "config_batch_1.json")
    assert len(batch1["jobs"]) == 3
    batch2 = load_data(output_path / "config_batch_2.json")
    assert len(batch2["jobs"]) == 1
    assert batch2["jobs"][0]["job_id"] == 4
    batch3 = load_data(output_path / "config_batch_3.json")
    assert len(batch3["jobs"]) == 1
    assert batch3["jobs"][0]["job_id"] == 5

def test_estimated_run_time(cleanup):
    # walltime is 240 minutes
    # 10-minute jobs
    # Each of the 4 cores can complete 24 jobs. 4 * 24 = 96 jobs
    # 100 jobs will take two batches.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -t -n2 -q4"
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")

    batch_config_1 = Path(OUTPUT) / "config_batch_1.json"
    assert os.path.exists(batch_config_1)
    batch_config_2 = Path(OUTPUT) / "config_batch_2.json"
    assert os.path.exists(batch_config_2)
    config1 = load_data(batch_config_1)
    assert len(config1["jobs"]) == 96
    config2 = load_data(batch_config_2)
    assert len(config2["jobs"]) == 4

def list_active_nodes(self, job_id):
    out1 = {}
    # It's possible that 500 characters won't be enough, even with the compact format.
    # Compare the node count against the result to make sure we got all nodes.
    # There should be a better way to get this.
    check_run_command(f'squeue -j {job_id} --format="%5D %500N" -h', out1)
    result = out1["stdout"].strip().split()
    assert len(result) == 2, str(result)
    num_nodes = int(result[0])
    nodes_compact = result[1]
    out2 = {}
    check_run_command(f'scontrol show hostnames "{nodes_compact}"', out2)
    nodes = [x for x in out2["stdout"].split("\n") if x != ""]
    if len(nodes) != num_nodes:
        raise Exception(f"Bug in parsing node names. Found={len(nodes)} Actual={num_nodes}")
    return nodes

def run_spark_cluster(job_name, jade_runtime_output, verbose, manager_script_and_args):
    """Create a Spark cluster across multiple nodes. The manager node will invoke the script."""
    config = create_config_from_file(Path(jade_runtime_output) / CONFIG_FILE)
    job = config.get_job(job_name)
    _set_hostnames(jade_runtime_output)
    output = {}
    check_run_command(f"jade cluster am-i-manager {jade_runtime_output}", output)
    result = output["stdout"].strip()
    manager_node = _get_manager_node_name(jade_runtime_output)
    if result == "true":
        ret = run_cluster_master(job, manager_node, jade_runtime_output, verbose, manager_script_and_args)
    else:
        assert result == "false", result
        ret = run_worker(job, manager_node, jade_runtime_output, verbose)
    return ret

def get_job_stats(self, job_id):
    cmd = f"sacct -j {job_id} --format=JobID,JobName%20,state,start,end,Account,Partition%15,QOS"
    output = {}
    check_run_command(cmd, output=output)
    result = output["stdout"].strip().split("\n")
    if len(result) != 6:
        raise Exception(f"Unknown output for sacct: {result} length={len(result)}")

    # 8165902  COMPLETED  2022-01-16T12:10:37  2022-01-17T04:04:34
    fields = result[2].split()
    if fields[0] != job_id:
        raise Exception(f"sacct returned unexpected job_id={fields[0]}")
    state = self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN)
    fmt = "%Y-%m-%dT%H:%M:%S"
    try:
        start = datetime.strptime(fields[3], fmt)
    except ValueError:
        logger.exception("Failed to parse start_time=%s", fields[3])
        raise
    try:
        if fields[4] == "Unknown":
            end = fields[4]
        else:
            end = datetime.strptime(fields[4], fmt)
    except ValueError:
        logger.exception("Failed to parse end_time=%s", fields[4])
        raise

    stats = HpcJobStats(
        hpc_job_id=job_id,
        name=fields[1],
        state=state,
        start=start,
        end=end,
        account=fields[5],
        partition=fields[6],
        qos=fields[7],
    )
    return stats

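# get_job_stats looks up the Slurm state string in a class-level _STATUSES mapping defined
# elsewhere. A minimal sketch of what such a mapping might contain, assuming an HpcJobStatus
# enum with QUEUED/RUNNING/COMPLETE members (UNKNOWN is used above as the fallback); the
# exact set of Slurm states covered is an illustrative assumption.
_STATUSES = {
    "PENDING": HpcJobStatus.QUEUED,
    "RUNNING": HpcJobStatus.RUNNING,
    "COMPLETED": HpcJobStatus.COMPLETE,
}
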
def test_run_generic_commands(generic_command_fixture):
    commands = [
        "ls .",
        "ls invalid-file-path",
    ]
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 2
    config.dump(CONFIG_FILE)

    cmds = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -h {FAKE_HPC_CONFIG}",
        # Test with higher queue depth. This exercises the code paths but
        # doesn't actually verify the functionality.
        # The infrastructure to do that is currently lacking. TODO
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -q 32 -h {FAKE_HPC_CONFIG}",
    )
    for cmd in cmds:
        check_run_command(cmd)
        check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    assert list(Path(OUTPUT).glob("*.sh"))
    check_run_command(f"jade prune-files {OUTPUT}")
    assert not list(Path(OUTPUT).glob("*.sh"))

def run_jobs(self, distributed_submitter=True, verbose=False, num_processes=None):
    """Run the jobs.

    Parameters
    ----------
    distributed_submitter : bool
        If True, make cluster updates.
    verbose : bool
        If True, enable debug logging.
    num_processes : int
        Number of processes to run in parallel; defaults to num CPUs.

    Returns
    -------
    Status

    """
    logger.info("Run jobs.")
    scratch_dir = self._create_local_scratch()
    are_inputs_local = self._intf_type == HpcType.LOCAL

    try:
        config_file = self._config.serialize_for_execution(scratch_dir, are_inputs_local)
        jobs = self._generate_jobs(config_file, verbose)
        os.environ["JADE_RUNTIME_OUTPUT"] = self._output
        os.environ["JADE_SUBMISSION_GROUP"] = self._config.get_default_submission_group().name

        # Setting node_setup_script and node_shutdown_script is obsolete and will
        # eventually be deleted.
        group = self._config.get_default_submission_group()
        if group.submitter_params.node_setup_script is not None:
            cmd = f"{group.submitter_params.node_setup_script} {config_file} {self._output}"
            check_run_command(cmd)
        elif self._config.node_setup_command is not None:
            check_run_command(self._config.node_setup_command)

        result = self._run_jobs(jobs, num_processes=num_processes)

        if group.submitter_params.node_shutdown_script:
            cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {self._output}"
            ret2 = run_command(cmd)
            if ret2 != 0:
                logger.error("Failed to run node shutdown script %s: %s", cmd, ret2)
        elif self._config.node_teardown_command is not None:
            start = time.time()
            ret2 = run_command(self._config.node_teardown_command)
            if ret2 != 0:
                logger.error(
                    "Failed to run node teardown command %s: %s",
                    self._config.node_teardown_command,
                    ret2,
                )
            logger.info("Node teardown command duration = %s seconds", time.time() - start)

        logger.info("Completed %s jobs", len(jobs))
    finally:
        shutil.rmtree(scratch_dir)

    if distributed_submitter and are_inputs_local:
        self._complete_hpc_job()

    return result

def test_estimated_run_time_too_long(job_too_long):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    with pytest.raises(ExecutionError):
        check_run_command(cmd)

def _run_cluster_master(job, manager_node, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir, f"run_spark_cluster__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run cluster master on %s job=%s: %s", socket.gethostname(), job.name, get_cli_string())

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    if job_output.exists():
        shutil.rmtree(job_output)
    job_output.mkdir(parents=True)
    events_dir = job_output / "spark" / "events"
    events_dir.mkdir(parents=True)
    logs_dir = job_output / "spark" / "logs"
    logs_dir.mkdir()
    workers_dir = job_output / "spark" / "workers"
    workers_dir.mkdir()

    # Make a job-specific conf directory because the log and event files need to be per-job.
    job_conf_dir = job_output / "spark" / "conf"
    shutil.copytree(Path(job.model.spark_config.conf_dir) / "conf", job_conf_dir)
    _fix_spark_conf_file(job_conf_dir, events_dir)
    _set_env_variables(job, job_conf_dir, logs_dir)

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())
    run_command(job.model.spark_config.get_stop_history_server())
    run_command(job.model.spark_config.get_stop_master())

    # It would be better to start all workers from the master. Doing so would require that
    # Spark processes on the master node be able to ssh into the worker nodes.
    # I haven't spent the time to figure out how to do that inside Singularity containers.
    master_cmd = job.model.spark_config.get_start_master()
    logger.info("Run spark master: [%s]", master_cmd)
    check_run_command(master_cmd)
    history_cmd = job.model.spark_config.get_start_history_server()
    logger.info("Run spark history server: [%s]", history_cmd)
    check_run_command(history_cmd)
    worker_memory = _get_worker_memory_str(job, is_master=True)
    worker_cmd = _get_worker_command(job, manager_node, memory=worker_memory)
    logger.info("Run spark worker: [%s]", worker_cmd)
    check_run_command(worker_cmd)

    # Wait for workers.
    # TODO: find a way to check programmatically with the REST API or by parsing the logs.
    time.sleep(15)

    args = list(manager_script_and_args) + [_get_cluster(manager_node), str(job_output)]
    if job.model.spark_config.run_user_script_inside_container:
        user_cmd = str(job.model.spark_config.get_run_user_script()) + " " + " ".join(args)
    else:
        user_cmd = " ".join(args)
    logger.info("Run user script [%s]", user_cmd)

    start = time.time()
    ret = run_command(user_cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)

    # Delay to ensure the history is saved.
    time.sleep(10)
    metrics = SparkMetrics("localhost", history=True)
    try:
        metrics.generate_metrics(job_output / "spark_metrics")
    except Exception:
        logger.exception("Failed to generate metrics")

    check_run_command(job.model.spark_config.get_stop_worker())
    check_run_command(job.model.spark_config.get_stop_history_server())
    check_run_command(job.model.spark_config.get_stop_master())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]), workers_dir / socket.gethostname())
    return ret

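# The user script above receives the cluster address from a _get_cluster helper defined
# elsewhere. A minimal sketch of what it might return, assuming the Spark master listens on
# its default port (7077); the URL scheme and port here are assumptions, not the project's
# confirmed implementation.
def _get_cluster(manager_node):
    return f"spark://{manager_node}:7077"
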
def test_dry_run(cleanup):
    cmd = f"jade submit-jobs --dry-run -h {SLURM_HPC_CONFIG} {CONFIG_FILE} --output={OUTPUT}"
    check_run_command(cmd)

def test_check_run_command():
    """Test that check_run_command raises an exception."""
    with pytest.raises(ExecutionError):
        check_run_command("ls invalid_test_file")

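# check_run_command is used throughout these snippets to run shell commands and fail loudly.
# A minimal sketch of the contract the test above relies on, assuming run_command executes
# the command, returns its exit code, and fills the optional output dict with stdout/stderr;
# this is an illustration of the expected behavior, not the library's actual implementation.
def check_run_command(cmd, output=None, **kwargs):
    ret = run_command(cmd, output=output, **kwargs)
    if ret != 0:
        raise ExecutionError(f"command returned error code {ret}: {cmd}")
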