def test_run_generic_commands(generic_command_fixture):
    commands = [
        "ls .",
        "ls invalid-file-path",
    ]
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 2
    config.dump(CONFIG_FILE)

    cmds = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -h {FAKE_HPC_CONFIG}",
        # Test with higher queue depth. This exercises the code paths but
        # doesn't actually verify the functionality.
        # The infrastructure to do that is currently lacking. TODO
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -q 32 -h {FAKE_HPC_CONFIG}",
    )

    for cmd in cmds:
        check_run_command(cmd)
        check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    assert list(Path(OUTPUT).glob("*.sh"))
    check_run_command(f"jade prune-files {OUTPUT}")
    assert not list(Path(OUTPUT).glob("*.sh"))


def create_config():
    num_commands = 5
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    for i, job_param in enumerate(jobs):
        if i < 3:
            job_param.submission_group = "group1"
        else:
            job_param.submission_group = "group2"
        config.add_job(job_param)

    hpc_config1 = load_data(FAKE_HPC_CONFIG)
    hpc_config2 = copy.deepcopy(hpc_config1)
    hpc_config1["hpc"]["walltime"] = "1:00:00"
    hpc_config2["hpc"]["walltime"] = "5:00:00"
    params1 = SubmitterParams(hpc_config=hpc_config1, per_node_batch_size=3)
    params2 = SubmitterParams(hpc_config=hpc_config2, per_node_batch_size=1)
    group1 = SubmissionGroup(name="group1", submitter_params=params1)
    group2 = SubmissionGroup(name="group2", submitter_params=params2)
    config.append_submission_group(group1)
    config.append_submission_group(group2)
    return config


def test_job_order(generic_command_fixture):
    num_jobs = 50
    commands = ["echo hello world"] * num_jobs
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs

    job = config.get_job("1")
    for i in range(10, 15):
        job.blocked_by.add(i)
    config.get_job("2").blocked_by.add("1")
    config.get_job("21").blocked_by.add("30")
    config.get_job("41").blocked_by.add("50")
    config.dump(CONFIG_FILE)

    cmd = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
        "--per-node-batch-size=10 "
        "--max-nodes=4 "
        "--poll-interval=0.1 "
        f"--hpc-config {FAKE_HPC_CONFIG} "
        "--num-processes=10"
    )
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    result_summary = ResultsSummary(OUTPUT)
    results = result_summary.list_results()
    assert len(results) == num_jobs
    tracker = {x.name: x for x in results}
    for i in range(10, 15):
        assert tracker["1"].completion_time > tracker[str(i)].completion_time
    assert tracker["2"].completion_time > tracker["1"].completion_time
    assert tracker["21"].completion_time > tracker["30"].completion_time
    assert tracker["41"].completion_time > tracker["50"].completion_time

    # Verify that stats are summarized correctly with aggregation mode.
    stats_text = Path(OUTPUT) / "stats.txt"
    assert stats_text.exists()
    assert "Average" in stats_text.read_text()
    stats_json = Path(OUTPUT) / "stats_summary.json"
    assert stats_json.exists()
    stats = load_data(stats_json)
    assert stats
    assert "batch" in stats[0]


def test_job_configuration__shuffle_jobs(job_fixture):
    num_jobs = 10
    with open(TEST_FILENAME, "w") as f_out:
        for i in range(num_jobs):
            f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs

    assert [x.name for x in config.iter_jobs()] == [str(x) for x in range(1, num_jobs + 1)]
    config.shuffle_jobs()
    assert [x.name for x in config.iter_jobs()] != [str(x) for x in range(1, num_jobs + 1)]


def test_job_configuration__check_job_dependencies_estimate(job_fixture):
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 1

    # per_node_batch_size=0 selects time-based batching, which requires a
    # run-time estimate for each job. This job has none, so the check fails.
    hpc_config = HpcConfig(**load_data(FAKE_HPC_CONFIG))
    params = SubmitterParams(hpc_config=hpc_config, per_node_batch_size=0)
    with pytest.raises(InvalidConfiguration):
        config.check_job_dependencies(params)


def test_job_configuration__custom_names(job_fixture):
    num_jobs = 3
    with open(TEST_FILENAME, "w") as f_out:
        for i in range(num_jobs):
            f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for i, job_param in enumerate(inputs.iter_jobs()):
        job_param.name = f"job_{i}"
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs

    # Adding a job with a duplicate name is not allowed.
    job = GenericCommandParameters(command="echo hello world", name="job_2")
    with pytest.raises(InvalidConfiguration):
        config.add_job(job)


@pytest.fixture
def cleanup():
    _do_cleanup()
    commands = [
        'echo "hello"',
        "ls invalid-path",
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
    ]
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(
        inputs, cancel_on_blocking_job_failure=True
    )
    # Chain jobs 3 through 8 so that each is blocked by its predecessor.
    config.get_job("3").set_blocking_jobs(set([2]))
    config.get_job("4").set_blocking_jobs(set([3]))
    config.get_job("5").set_blocking_jobs(set([4]))
    config.get_job("6").set_blocking_jobs(set([5]))
    config.get_job("7").set_blocking_jobs(set([6]))
    config.get_job("8").set_blocking_jobs(set([7]))
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()


@pytest.fixture
def cleanup():
    _do_cleanup()
    commands = ['echo "hello world"'] * NUM_COMMANDS
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(inputs)
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()


def test_job_configuration__check_job_dependencies_blocking(job_fixture):
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 1

    hpc_config = HpcConfig(**load_data(FAKE_HPC_CONFIG))
    params = SubmitterParams(hpc_config=hpc_config)

    # Block the job on a job name that does not exist in the config.
    job = config.get_job("1")
    job.blocked_by.add("10")
    with pytest.raises(InvalidConfiguration):
        config.check_job_dependencies(params)

    # While we have this setup, verify that submit-jobs calls this function.
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} " "--poll-interval=.1 "
    ret = run_command(cmd)
    assert ret != 0


def test_try_add_blocked_jobs(cleanup):
    num_commands = 5
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    for i, job_param in enumerate(jobs):
        if i == num_commands - 1:
            job_param.blocked_by = set([1, 2, 3, 4])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    for option in ("--try-add-blocked-jobs", "--no-try-add-blocked-jobs"):
        cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} -p 0.1 {option}"
        ret = run_command(cmd)
        assert ret == 0
        ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
        assert ret == 0

        events_file = os.path.join(OUTPUT, "submit_jobs_events.log")
        events_summary = EventsSummary(OUTPUT, preload=True)
        submit_events = events_summary.list_events(EVENT_NAME_HPC_SUBMIT)
        if option == "--try-add-blocked-jobs":
            assert len(submit_events) == 1
            event = submit_events[0]
            assert event.data["batch_size"] == num_commands
            shutil.rmtree(OUTPUT)
        else:
            assert len(submit_events) == 2
            event1 = submit_events[0]
            event2 = submit_events[1]
            assert event1.data["batch_size"] == num_commands - 1
            assert event2.data["batch_size"] == 1


@pytest.fixture
def job_too_long():
    _do_cleanup()
    commands = ['echo "hello world"'] * NUM_COMMANDS
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(inputs, minutes_per_job=10)
    # Give one job an estimated run time far longer than the others.
    for i, job in enumerate(config.iter_jobs()):
        if i == 1:
            job.estimated_run_minutes = 1000
            break
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()


def test_resubmit_with_blocking_jobs(basic_setup):
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = set([5])
        elif i == 4:
            job_param.blocked_by = set([7])
        elif i == 6:
            job_param.blocked_by = set([6])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1

    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3