class JobAnalysis: """Provides functionality to analyze job results.""" def __init__(self, output_dir, config): self._output_dir = output_dir self._config = config self._results = ResultsSummary(output_dir) def get_job(self, job_name): """Return the job from the config file with job_name. Parameters ---------- job_name : str Returns ------- namedtuple """ return self._config.get_job(job_name) def get_successful_result(self, job_name): """Return the job result from the results file. Refer to :func:`~jade.result.ResultSummary.get_successful_result`. """ return self._results.get_successful_result(job_name) def get_simulation(self, job_name): """Return a simulation object for the job_name. Parameters ---------- job_name : str Returns ------- JobExecutionInterface """ # Make sure it was successful, otherwise it will raise. self.get_successful_result(job_name) job = self.get_job(job_name) simulation = self._config.create_from_result( job, os.path.join(self._output_dir, JOBS_OUTPUT_DIR)) return simulation def list_results(self): """Return a list of Result objects.""" return self._results.list_results() @property def output_dir(self): return self._output_dir def show_results(self, only_failed=False, only_successful=False): """Show the results in terminal.""" return self._results.show_results(only_failed=only_failed, only_successful=only_successful)
def _get_jobs_to_resubmit(cluster, output, failed, missing):
    """Return the set of job names that should be resubmitted."""
    summary = ResultsSummary(output)
    candidates = []
    if failed:
        # Canceled jobs are retried along with outright failures.
        by_type = summary.get_results_by_type()
        candidates.extend(by_type["canceled"])
        candidates.extend(by_type["failed"])
    if missing:
        candidates.extend(summary.get_missing_jobs(cluster.iter_jobs()))
    return {job.name for job in candidates}
def show_results(failed, output, successful, post_process, job_name, verbose):
    """Shows the results of a batch of jobs.

    Parameters
    ----------
    failed : bool
        Show only failed results.
    output : str
        Output directory of a previous run.
    successful : bool
        Show only successful results.
    post_process : bool
        Show post-process results for job_name instead.
    job_name : str
        Job whose post-process results to show (used with post_process).
    verbose : bool
        Enable debug-level console logging.

    """
    # Local import keeps this robustness fix self-contained.
    from pathlib import Path

    # Fail fast with a clear message instead of a traceback when the output
    # directory is absent (consistent with the newer variant of this command).
    if not Path(output).exists():
        print(f"{output} does not exist", file=sys.stderr)
        sys.exit(1)
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("show_results", None, console_level=level)
    if post_process:
        JobPostProcess.show_results(output, job_name)
        sys.exit(0)
    results = ResultsSummary(output)
    results.show_results(only_failed=failed, only_successful=successful)
def test_job_order(generic_command_fixture):
    """Blocked jobs must complete after every job that blocks them."""
    num_jobs = 50
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n" * num_jobs)

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for params in inputs.iter_jobs():
        config.add_job(params)
    assert config.get_num_jobs() == num_jobs

    # Job "1" waits on jobs 10-14; a few other jobs chain on each other.
    first_job = config.get_job("1")
    for blocker in range(10, 15):
        first_job.blocked_by.add(blocker)
    for name, blocker in (("2", "1"), ("21", "30"), ("41", "50")):
        config.get_job(name).blocked_by.add(blocker)
    config.dump(CONFIG_FILE)

    cmd = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
        "--per-node-batch-size=10 "
        "--max-nodes=4 "
        "--poll-interval=0.1 "
        f"--hpc-config {FAKE_HPC_CONFIG} "
        "--num-processes=10"
    )
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    completion = {
        res.name: res.completion_time
        for res in ResultsSummary(OUTPUT).list_results()
    }
    assert len(completion) == num_jobs
    for blocker in range(10, 15):
        assert completion["1"] > completion[str(blocker)]
    assert completion["2"] > completion["1"]
    assert completion["21"] > completion["30"]
    assert completion["41"] > completion["50"]

    # Verify that stats are summarized correctly with aggregation mode.
    stats_text = Path(OUTPUT) / "stats.txt"
    assert stats_text.exists()
    assert "Average" in stats_text.read_text()
    stats_json = Path(OUTPUT) / "stats_summary.json"
    assert stats_json.exists()
    stats = load_data(stats_json)
    assert stats
    assert "batch" in stats[0]
def submit_jobs(config_file, per_node_batch_size, hpc_config, local, max_nodes,
                output, poll_interval, num_processes, rotate_logs, verbose,
                restart_failed, restart_missing, reports, try_add_blocked_jobs):
    """Submits jobs for execution, locally or on HPC.

    Parameters
    ----------
    config_file : str
        Path to the job configuration file.
    per_node_batch_size : int
        Maximum number of jobs per compute node.
    hpc_config : str
        HPC configuration passed to JobSubmitter — presumably a config file
        path; TODO confirm against JobSubmitter.
    local : bool
        Force local execution instead of HPC.
    max_nodes : int
        Maximum number of nodes to use.
    output : str
        Output directory; created if it does not exist.
    poll_interval : float
        Seconds between status polls.
    num_processes : int
        Number of worker processes.
    rotate_logs : bool
        Rotate existing ".log" files in output before submitting.
    verbose : bool
        Enable debug-level logging.
    restart_failed : bool
        Resubmit only the jobs that failed in a previous run.
    restart_missing : bool
        Resubmit only the jobs with no recorded result in a previous run.
    reports : bool
        Passed through to JobSubmitter.submit_jobs.
    try_add_blocked_jobs : bool
        Passed through to JobSubmitter.submit_jobs.

    """
    os.makedirs(output, exist_ok=True)

    # Results from a previous run that should be treated as already complete;
    # passed to the submitter so those jobs are not re-run.
    previous_results = []

    if restart_failed:
        # Build a new config containing only the failed jobs from the
        # previous run, then point config_file at that filtered config.
        failed_job_config = create_config_from_previous_run(
            config_file, output, result_type='failed')
        previous_results = ResultsSummary(output).get_successful_results()
        config_file = "failed_job_inputs.json"
        failed_job_config.dump(config_file)

    if restart_missing:
        # Build a config containing only jobs with no recorded result.
        # NOTE: if restart_failed was also set, config_file here is already
        # the failed-jobs config, so the two filters chain.
        missing_job_config = create_config_from_previous_run(
            config_file, output, result_type='missing')
        config_file = "missing_job_inputs.json"
        missing_job_config.dump(config_file)
        # All previously recorded results count as done in this mode; this
        # intentionally overwrites the list set by restart_failed above.
        previous_results = ResultsSummary(output).list_results()

    if rotate_logs:
        rotate_filenames(output, ".log")

    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    event_file = os.path.join(output, "submit_jobs_events.log")
    # This effectively means no console logging.
    setup_logging("event", event_file, console_level=logging.ERROR,
                  file_level=logging.INFO)

    mgr = JobSubmitter(config_file, hpc_config=hpc_config, output=output)
    ret = mgr.submit_jobs(
        per_node_batch_size=per_node_batch_size,
        max_nodes=max_nodes,
        force_local=local,
        verbose=verbose,
        num_processes=num_processes,
        poll_interval=poll_interval,
        previous_results=previous_results,
        reports=reports,
        try_add_blocked_jobs=try_add_blocked_jobs,
    )
    # Exit with the submitter's status so shell callers can branch on it.
    sys.exit(ret.value)
def test_resubmit_successful(cleanup):
    """Resubmit successful jobs after shrinking the per-node batch size."""

    def assert_all_successful():
        # Every command should have succeeded with nothing failed.
        summary = ResultsSummary(OUTPUT)
        assert not summary.get_failed_results()
        assert len(summary.get_successful_results()) == NUM_COMMANDS

    check_run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}")
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01 -t2")
    assert_all_successful()

    check_run_command(
        f"jade config save-submission-groups {OUTPUT} -c {SG_FILE}")
    groups = load_data(SG_FILE)
    params = groups[0]["submitter_params"]
    assert params["per_node_batch_size"] > NUM_COMMANDS
    params["per_node_batch_size"] = NUM_COMMANDS
    dump_data(groups, SG_FILE)

    check_run_command(f"{RESUBMIT_JOBS} {OUTPUT} -s {SG_FILE} --successful")
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert_all_successful()

    check_run_command(
        f"jade config save-submission-groups {OUTPUT} --force -c {SG_FILE}")
    groups = load_data(SG_FILE)
    assert groups[0]["submitter_params"]["per_node_batch_size"] == NUM_COMMANDS
def create_config_from_previous_run(config_file, output, result_type="successful", **kwargs):
    """Create a JobConfiguration from a previous config file, keeping only
    the jobs whose results match result_type.

    Parameters
    ----------
    config_file : str
        location of config
    output : str
        location of previous results
    result_type : string
        type of results

    Returns
    -------
    JobConfiguration

    Raises
    ------
    InvalidParameter
        Raised if result_type is not successful or failed

    """
    if result_type not in ("successful", "failed", "missing"):
        raise InvalidParameter(f"given result type invalid: {result_type}")

    config = deserialize_config(load_data(config_file))
    summary = ResultsSummary(output)
    if result_type == "successful":
        selected = summary.get_successful_results()
    elif result_type == "failed":
        selected = summary.get_failed_results()
    else:
        selected = summary.get_missing_jobs(config.iter_jobs())

    # Both jobs and results expose `.name`, so one lookup covers all cases.
    config.reconfigure_jobs([config.get_job(item.name) for item in selected])
    return deserialize_config(config.serialize(), **kwargs)
def test_resubmit_missing(cleanup):
    """Drop one job's result and verify resubmission restores it."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    all_results = aggregator.get_results_unsafe()
    assert all_results
    assert all(result.return_code == 0 for result in all_results)
    # Drop the last job's result so it appears to have never run.
    all_results.pop()
    aggregator._write_results(all_results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    removed = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [removed["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert not summary.get_failed_results()
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    assert len(ResultsSummary(OUTPUT).get_successful_results()) == NUM_COMMANDS
def test_job_order(generic_command_fixture):
    """Blocked jobs must finish after every job that blocks them."""
    num_jobs = 50
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n" * num_jobs)

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for params in inputs.iter_jobs():
        config.add_job(params)
    assert config.get_num_jobs() == num_jobs

    # Job "1" waits on jobs 10-14; a few other jobs chain on each other.
    first_job = config.get_job("1")
    for blocker in range(10, 15):
        first_job.blocked_by.add(blocker)
    for name, blocker in (("2", "1"), ("21", "30"), ("41", "50")):
        config.get_job(name).blocked_by.add(blocker)
    config.dump(CONFIG_FILE)

    os.environ["FAKE_HPC_CLUSTER"] = "True"
    cmd = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
        "--per-node-batch-size=10 "
        "--max-nodes=4 "
        "--poll-interval=.1 "
        "--num-processes=10"
    )
    assert run_command(cmd) == 0

    completion = {
        res.name: res.completion_time
        for res in ResultsSummary(OUTPUT).list_results()
    }
    assert len(completion) == num_jobs
    for blocker in range(10, 15):
        assert completion["1"] > completion[str(blocker)]
    assert completion["2"] > completion["1"]
    assert completion["21"] > completion["30"]
    assert completion["41"] > completion["50"]
def test_resubmit_failed(cleanup):
    """Mark one job as failed and verify resubmission recovers it."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    results = aggregator.get_results_unsafe()
    assert results
    assert all(result.return_code == 0 for result in results)
    # Rewrite the first result with a non-zero return code to fake a failure.
    first = results[0]
    results[0] = Result(
        first.name, 1, first.status, first.exec_time_s, first.completion_time)
    aggregator._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    assert ResultsSummary(OUTPUT).get_failed_results()[0].name == "1"

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    assert len(ResultsSummary(OUTPUT).get_successful_results()) == NUM_COMMANDS
def show_results(failed, output, successful, post_process, job_name, verbose):
    """Shows the results of a batch of jobs."""
    # Bail out early with a readable error when the output dir is missing.
    if not Path(output).exists():
        print(f"{output} does not exist", file=sys.stderr)
        sys.exit(1)

    console_level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("show_results", None, console_level=console_level)

    if post_process:
        JobPostProcess.show_results(output, job_name)
        sys.exit(0)

    try:
        summary = ResultsSummary(output)
    except InvalidConfiguration:
        message = (
            f"No results are available in {output}. To check status of in-progress jobs run "
            f"'jade show-status -o {output}'"
        )
        print(message, file=sys.stderr)
        sys.exit(1)
    summary.show_results(only_failed=failed, only_successful=successful)
def test_cancel_on_failure_detect_by_submitter(cleanup):
    # HpcSubmitter handles the cancellation because the blocked job will be in the 2nd batch.
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b2") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    summary = ResultsSummary(OUTPUT)
    expected = {"successful": 1, "failed": 1, "canceled": 6}
    assert len(summary.get_successful_results()) == expected["successful"]
    assert len(summary.get_failed_results()) == expected["failed"]
    assert len(summary.get_canceled_results()) == expected["canceled"]
    # The by-type view must agree with the individual accessors.
    by_type = summary.get_results_by_type()
    for kind, count in expected.items():
        assert len(by_type[kind]) == count
def test_cancel_on_failure_detect_by_runner(cleanup):
    # JobRunner handles the cancellation in JobQueue because the blocked job is in the batch
    # along with the blocking job.
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b8") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    summary = ResultsSummary(OUTPUT)
    expected = {"successful": 1, "failed": 1, "canceled": 6}
    assert len(summary.get_successful_results()) == expected["successful"]
    assert len(summary.get_failed_results()) == expected["failed"]
    assert len(summary.get_canceled_results()) == expected["canceled"]
    # The by-type view must agree with the individual accessors.
    by_type = summary.get_results_by_type()
    for kind, count in expected.items():
        assert len(by_type[kind]) == count
def test_resubmit_with_blocking_jobs(basic_setup):
    """Resubmitting a missing job must recursively pick up its blocked dependents."""
    num_commands = 7
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write('echo "hello world"\n' * num_commands)

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    blockers_by_index = {3: set([5]), 4: set([7]), 6: set([6])}
    for index, job_param in enumerate(inputs.iter_jobs()):
        if index in blockers_by_index:
            job_param.blocked_by = blockers_by_index[index]
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    results = aggregator.get_results_unsafe()
    assert results
    assert all(result.return_code == 0 for result in results)
    # Remove job 7's aggregated result so it looks like it never ran.
    index_of_7 = next(
        (i for i, result in enumerate(results) if result.name == "7"), None)
    assert index_of_7 is not None
    results.pop(index_of_7)
    aggregator._write_results(results)

    # Mirror the removal in the final results file and mark the job missing.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert not summary.get_failed_results()
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands
    # The resubmission should have created a second batch with job 7 plus its
    # two transitive dependents.
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    assert len(load_data(second_batch_file)["jobs"]) == 3
def __init__(self, output_dir, config):
    """Initialize analysis state for a completed run.

    Parameters
    ----------
    output_dir : str
        Directory containing a previous run's output.
    config : object
        Job configuration — presumably a JobConfiguration; TODO confirm
        against callers.

    """
    self._output_dir = output_dir
    self._config = config
    # Parses the results stored under output_dir.
    self._results = ResultsSummary(output_dir)