Example #1
class JobAnalysis:
    """Provides functionality to analyze job results."""
    def __init__(self, output_dir, config):
        self._output_dir = output_dir
        self._config = config
        self._results = ResultsSummary(output_dir)

    def get_job(self, job_name):
        """Return the job from the config file with job_name.

        Parameters
        ----------
        job_name : str

        Returns
        -------
        namedtuple

        """
        return self._config.get_job(job_name)

    def get_successful_result(self, job_name):
        """Return the job result from the results file.
        Refer to :func:`~jade.result.ResultsSummary.get_successful_result`.

        """
        return self._results.get_successful_result(job_name)

    def get_simulation(self, job_name):
        """Return a simulation object for the job_name.

        Parameters
        ----------
        job_name : str

        Returns
        -------
        JobExecutionInterface

        """
        # Make sure the job was successful; this call raises if it was not.
        self.get_successful_result(job_name)

        job = self.get_job(job_name)
        simulation = self._config.create_from_result(
            job, os.path.join(self._output_dir, JOBS_OUTPUT_DIR))
        return simulation

    def list_results(self):
        """Return a list of Result objects."""
        return self._results.list_results()

    @property
    def output_dir(self):
        """Return the output directory."""
        return self._output_dir

    def show_results(self, only_failed=False, only_successful=False):
        """Show the results in terminal."""
        return self._results.show_results(only_failed=only_failed,
                                          only_successful=only_successful)
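A minimal usage sketch of the class above, assuming a completed JADE run whose output directory is "output" and an already-deserialized config object; the directory name and the job name "job_1" are placeholders.

# Hypothetical usage of the JobAnalysis class defined above.
analysis = JobAnalysis("output", config)

# Print the name and return code of every result from the run.
for result in analysis.list_results():
    print(result.name, result.return_code)

# Re-create the simulation object for one job; get_simulation raises if the
# job did not complete successfully.
simulation = analysis.get_simulation("job_1")

# Display only the failed results in the terminal.
analysis.show_results(only_failed=True)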
Example #2
def _get_jobs_to_resubmit(cluster, output, failed, missing):
    results = ResultsSummary(output)
    jobs_to_resubmit = []
    if failed:
        res = results.get_results_by_type()
        jobs_to_resubmit += res["canceled"]
        jobs_to_resubmit += res["failed"]
    if missing:
        jobs_to_resubmit += results.get_missing_jobs(cluster.iter_jobs())

    return {x.name for x in jobs_to_resubmit}
Example #3
def show_results(failed, output, successful, post_process,
                 job_name, verbose):
    """Shows the results of a batch of jobs."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("show_results", None, console_level=level)

    if post_process:
        JobPostProcess.show_results(output, job_name)
        sys.exit(0)

    results = ResultsSummary(output)
    results.show_results(only_failed=failed, only_successful=successful)
Example #4
def test_job_order(generic_command_fixture):
    num_jobs = 50
    commands = ["echo hello world"] * num_jobs

    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    job = config.get_job("1")
    for i in range(10, 15):
        job.blocked_by.add(i)

    config.get_job("2").blocked_by.add("1")
    config.get_job("21").blocked_by.add("30")
    config.get_job("41").blocked_by.add("50")
    config.dump(CONFIG_FILE)

    cmd = (f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
           "--per-node-batch-size=10 "
           "--max-nodes=4 "
           "--poll-interval=0.1 "
           f"--hpc-config {FAKE_HPC_CONFIG} "
           "--num-processes=10")
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    result_summary = ResultsSummary(OUTPUT)
    results = result_summary.list_results()
    assert len(results) == num_jobs
    tracker = {x.name: x for x in results}

    for i in range(10, 15):
        assert tracker["1"].completion_time > tracker[str(i)].completion_time

    assert tracker["2"].completion_time > tracker["1"].completion_time
    assert tracker["21"].completion_time > tracker["30"].completion_time
    assert tracker["41"].completion_time > tracker["50"].completion_time

    # Verify that stats are summarized correctly with aggregation mode.
    stats_text = Path(OUTPUT) / "stats.txt"
    assert stats_text.exists()
    assert "Average" in stats_text.read_text()
    stats_json = Path(OUTPUT) / "stats_summary.json"
    assert stats_json.exists()
    stats = load_data(stats_json)
    assert stats
    assert "batch" in stats[0]
Example #5
def submit_jobs(config_file, per_node_batch_size, hpc_config, local, max_nodes,
                output, poll_interval, num_processes, rotate_logs, verbose,
                restart_failed, restart_missing, reports,
                try_add_blocked_jobs):
    """Submits jobs for execution, locally or on HPC."""
    os.makedirs(output, exist_ok=True)

    previous_results = []

    if restart_failed:
        failed_job_config = create_config_from_previous_run(
            config_file, output, result_type='failed')
        previous_results = ResultsSummary(output).get_successful_results()
        config_file = "failed_job_inputs.json"
        failed_job_config.dump(config_file)

    if restart_missing:
        missing_job_config = create_config_from_previous_run(
            config_file, output, result_type='missing')
        config_file = "missing_job_inputs.json"
        missing_job_config.dump(config_file)
        previous_results = ResultsSummary(output).list_results()

    if rotate_logs:
        rotate_filenames(output, ".log")

    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    event_file = os.path.join(output, "submit_jobs_events.log")
    # This effectively means no console logging.
    setup_logging("event",
                  event_file,
                  console_level=logging.ERROR,
                  file_level=logging.INFO)

    mgr = JobSubmitter(config_file, hpc_config=hpc_config, output=output)
    ret = mgr.submit_jobs(
        per_node_batch_size=per_node_batch_size,
        max_nodes=max_nodes,
        force_local=local,
        verbose=verbose,
        num_processes=num_processes,
        poll_interval=poll_interval,
        previous_results=previous_results,
        reports=reports,
        try_add_blocked_jobs=try_add_blocked_jobs,
    )

    sys.exit(ret.value)
Example #6
def test_resubmit_successful(cleanup):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01 -t2")
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS

    check_run_command(
        f"jade config save-submission-groups {OUTPUT} -c {SG_FILE}")
    groups = load_data(SG_FILE)
    assert groups[0]["submitter_params"]["per_node_batch_size"] > NUM_COMMANDS
    groups[0]["submitter_params"]["per_node_batch_size"] = NUM_COMMANDS
    dump_data(groups, SG_FILE)

    check_run_command(f"{RESUBMIT_JOBS} {OUTPUT} -s {SG_FILE} --successful")
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS

    check_run_command(
        f"jade config save-submission-groups {OUTPUT} --force -c {SG_FILE}")
    groups = load_data(SG_FILE)
    assert groups[0]["submitter_params"]["per_node_batch_size"] == NUM_COMMANDS
Example #7
def create_config_from_previous_run(config_file,
                                    output,
                                    result_type="successful",
                                    **kwargs):
    """Create instance of a JobConfiguration from a previous config file,
    returning only those of the type given

    Parameters
    ----------
    config_file : str
        location of config
    output : str
        location of previous results
    result_type : string
        type of results

    Returns
    -------
    JobConfiguration

    Raises
    ------
    InvalidParameter
        Raised if result_type is not one of the allowed types.

    """
    allowed_types = ["successful", "failed", "missing"]
    if result_type not in allowed_types:
        raise InvalidParameter(f"given result type invalid: {result_type}")

    config = deserialize_config(load_data(config_file))
    summary = ResultsSummary(output)
    results_of_type = []

    if result_type == "successful":
        results_of_type = summary.get_successful_results()
    elif result_type == "failed":
        results_of_type = summary.get_failed_results()
    elif result_type == "missing":
        results_of_type = summary.get_missing_jobs(config.iter_jobs())

    parameters = []
    # Note that both jobs and results have `.name`.
    for result in results_of_type:
        job_parameters = config.get_job(result.name)
        parameters.append(job_parameters)

    config.reconfigure_jobs(parameters)
    return deserialize_config(config.serialize(), **kwargs)
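A short sketch of how this helper might be used, mirroring the restart_failed branch of submit_jobs in Example #5; the file and directory names are placeholders.

# Build a configuration containing only the failed jobs from a previous run
# in "output" and write it out so those jobs can be resubmitted.
failed_config = create_config_from_previous_run(
    "config.json", "output", result_type="failed")
failed_config.dump("failed_job_inputs.json")

# Jobs that already succeeded can be passed to the submitter as previous
# results so they are not re-executed (see submit_jobs in Example #5).
previous_results = ResultsSummary("output").get_successful_results()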
Example #8
def test_resubmit_missing(cleanup):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    # Remove one result from the aggregator to simulate a job that never completed.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    results.pop()
    agg._write_results(results)

    # Rewrite the final results file so that the same job is reported as missing.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Example #9
def test_job_order(generic_command_fixture):
    num_jobs = 50
    commands = ["echo hello world"] * num_jobs

    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    job = config.get_job("1")
    for i in range(10, 15):
        job.blocked_by.add(i)

    config.get_job("2").blocked_by.add("1")
    config.get_job("21").blocked_by.add("30")
    config.get_job("41").blocked_by.add("50")
    config.dump(CONFIG_FILE)

    os.environ["FAKE_HPC_CLUSTER"] = "True"

    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} " \
        "--per-node-batch-size=10 " \
        "--max-nodes=4 " \
        "--poll-interval=.1 " \
        "--num-processes=10"
    ret = run_command(cmd)
    assert ret == 0

    result_summary = ResultsSummary(OUTPUT)
    results = result_summary.list_results()
    assert len(results) == num_jobs
    tracker = {}
    for result in results:
        tracker[result.name] = result

    for i in range(10, 15):
        assert tracker["1"].completion_time > tracker[str(i)].completion_time

    assert tracker["2"].completion_time > tracker["1"].completion_time
    assert tracker["21"].completion_time > tracker["30"].completion_time
    assert tracker["41"].completion_time > tracker["50"].completion_time
Example #10
def test_resubmit_failed(cleanup):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    # Rewrite the first job's result with a nonzero return code to simulate a failure.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    x = results[0]
    results[0] = Result(x.name, 1, x.status, x.exec_time_s, x.completion_time)
    agg._write_results(results)

    # Update the final results file to reflect the simulated failure.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Example #11
def show_results(failed, output, successful, post_process, job_name, verbose):
    """Shows the results of a batch of jobs."""
    if not Path(output).exists():
        print(f"{output} does not exist", file=sys.stderr)
        sys.exit(1)

    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("show_results", None, console_level=level)

    if post_process:
        JobPostProcess.show_results(output, job_name)
        sys.exit(0)

    try:
        results = ResultsSummary(output)
    except InvalidConfiguration:
        print(
            f"No results are available in {output}. To check status of in-progress jobs run "
            f"'jade show-status -o {output}'",
            file=sys.stderr,
        )
        sys.exit(1)

    results.show_results(only_failed=failed, only_successful=successful)
Example #12
def test_cancel_on_failure_detect_by_submitter(cleanup):
    # HpcSubmitter handles the cancellation because the blocked job will be in the 2nd batch.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b2"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == 1
    assert len(summary.get_failed_results()) == 1
    assert len(summary.get_canceled_results()) == 6
    results = summary.get_results_by_type()
    assert len(results["successful"]) == 1
    assert len(results["failed"]) == 1
    assert len(results["canceled"]) == 6
Example #13
def test_cancel_on_failure_detect_by_runner(cleanup):
    # JobRunner handles the cancellation in JobQueue because the blocked job is in the batch
    # along with the blocking job.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b8"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == 1
    assert len(summary.get_failed_results()) == 1
    assert len(summary.get_canceled_results()) == 6
    results = summary.get_results_by_type()
    assert len(results["successful"]) == 1
    assert len(results["failed"]) == 1
    assert len(results["canceled"]) == 6
Example #14
def test_resubmit_with_blocking_jobs(basic_setup):
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = set([5])
        elif i == 4:
            job_param.blocked_by = set([7])
        elif i == 6:
            job_param.blocked_by = set([6])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    # Remove job 7's result from the aggregator to simulate a job that never completed.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)

    # Rewrite the final results file so that job 7 is reported as missing.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands

    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3