Ejemplo n.º 1
0
def test_resubmit_successful(cleanup):
    """Resubmit successful jobs after editing the saved submission groups.

    Runs the initial submission, lowers ``per_node_batch_size`` in the saved
    submission groups file to ``NUM_COMMANDS``, resubmits with
    ``--successful``, and verifies that the submission groups saved from the
    output afterwards carry the new batch size.
    """
    submit_cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    check_run_command(submit_cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01 -t2")
    initial_summary = ResultsSummary(OUTPUT)
    assert len(initial_summary.get_failed_results()) == 0
    assert len(initial_summary.get_successful_results()) == NUM_COMMANDS

    save_groups_cmd = f"jade config save-submission-groups {OUTPUT} -c {SG_FILE}"
    check_run_command(save_groups_cmd)
    submission_groups = load_data(SG_FILE)
    submitter_params = submission_groups[0]["submitter_params"]
    # The starting batch size must be larger than the job count so that the
    # edit below is an actual change.
    assert submitter_params["per_node_batch_size"] > NUM_COMMANDS
    submitter_params["per_node_batch_size"] = NUM_COMMANDS
    dump_data(submission_groups, SG_FILE)

    resubmit_cmd = f"{RESUBMIT_JOBS} {OUTPUT} -s {SG_FILE} --successful"
    check_run_command(resubmit_cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    final_summary = ResultsSummary(OUTPUT)
    assert len(final_summary.get_failed_results()) == 0
    assert len(final_summary.get_successful_results()) == NUM_COMMANDS

    # Re-export the groups from the output directory and confirm the edited
    # batch size was the one used.
    overwrite_groups_cmd = (
        f"jade config save-submission-groups {OUTPUT} --force -c {SG_FILE}"
    )
    check_run_command(overwrite_groups_cmd)
    saved_groups = load_data(SG_FILE)
    assert saved_groups[0]["submitter_params"]["per_node_batch_size"] == NUM_COMMANDS
Ejemplo n.º 2
0
def test_resubmit_missing(cleanup):
    """Resubmit after one job's result has been removed from the output.

    After a clean first run, the last result is dropped from the aggregator's
    results and from the final results file (where it is recorded as a missing
    job). The resubmission is then expected to bring the number of successful
    results back to ``NUM_COMMANDS``.
    """
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    job_results = aggregator.get_results_unsafe()
    assert job_results
    assert all(entry.return_code == 0 for entry in job_results)
    # Drop the last result so that one job no longer has a recorded outcome.
    job_results.pop()
    aggregator._write_results(job_results)

    # Apply the matching edit to the final results file.
    results_path = os.path.join(OUTPUT, RESULTS_FILE)
    report = load_data(results_path)
    dropped = report["results"].pop()
    counts = report["results_summary"]
    counts["num_missing"] = 1
    counts["num_successful"] -= 1
    report["missing_jobs"] = [dropped["name"]]
    dump_data(report, results_path)

    before = ResultsSummary(OUTPUT)
    assert len(before.get_failed_results()) == 0
    assert len(before.get_successful_results()) == NUM_COMMANDS - 1

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    after = ResultsSummary(OUTPUT)
    assert len(after.get_successful_results()) == NUM_COMMANDS
Ejemplo n.º 3
0
def test_resubmit_failed(cleanup):
    """Resubmit after one job's result has been rewritten as a failure.

    After a clean first run, the first result is replaced by a copy with
    return code 1 in both the aggregator's results and the final results
    file. The resubmission is then expected to bring the number of successful
    results back to ``NUM_COMMANDS``.
    """
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    job_results = aggregator.get_results_unsafe()
    assert job_results
    assert all(entry.return_code == 0 for entry in job_results)
    # Replace the first result with an otherwise identical one that failed.
    original = job_results[0]
    job_results[0] = Result(
        original.name,
        1,
        original.status,
        original.exec_time_s,
        original.completion_time,
    )
    aggregator._write_results(job_results)

    # Apply the matching edit to the final results file.
    results_path = os.path.join(OUTPUT, RESULTS_FILE)
    report = load_data(results_path)
    report["results"][0]["return_code"] = 1
    counts = report["results_summary"]
    counts["num_failed"] = 1
    counts["num_successful"] -= 1
    dump_data(report, results_path)

    failed = ResultsSummary(OUTPUT).get_failed_results()
    assert failed[0].name == "1"

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    after = ResultsSummary(OUTPUT)
    assert len(after.get_successful_results()) == NUM_COMMANDS
Ejemplo n.º 4
0
def create_config_from_previous_run(config_file,
                                    output,
                                    result_type="successful",
                                    **kwargs):
    """Create a JobConfiguration from a previous config file, keeping only
    the jobs whose previous outcome matches the given result type.

    Parameters
    ----------
    config_file : str
        location of config
    output : str
        location of previous results
    result_type : str
        type of results to keep: "successful", "failed", or "missing"
    **kwargs
        forwarded to ``deserialize_config`` when building the returned
        configuration

    Returns
    -------
    JobConfiguration

    Raises
    ------
    InvalidParameter
        Raised if result_type is not "successful", "failed", or "missing"

    """
    allowed_types = ("successful", "failed", "missing")
    # Validate before touching the filesystem so a bad argument fails fast.
    if result_type not in allowed_types:
        raise InvalidParameter(f"given result type invalid: {result_type}")

    config = deserialize_config(load_data(config_file))
    summary = ResultsSummary(output)

    if result_type == "successful":
        results_of_type = summary.get_successful_results()
    elif result_type == "failed":
        results_of_type = summary.get_failed_results()
    else:
        # Only "missing" remains after the validation above.
        results_of_type = summary.get_missing_jobs(config.iter_jobs())

    # Note that both jobs and results have `.name`.
    parameters = [config.get_job(result.name) for result in results_of_type]

    config.reconfigure_jobs(parameters)
    # Round-trip through serialization so that kwargs are applied to a fresh
    # configuration instance.
    return deserialize_config(config.serialize(), **kwargs)
Ejemplo n.º 5
0
def test_cancel_on_failure_detect_by_submitter(cleanup):
    """Check the result counts after a run submitted with ``-n2 -b2``.

    Expects one successful, one failed and six canceled results, both from
    the per-type getters and from ``get_results_by_type``.
    """
    # HpcSubmitter handles the cancellation because the blocked job will be in the 2nd batch.
    submit_cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b2"
    assert run_command(submit_cmd) == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    expected_counts = {"successful": 1, "failed": 1, "canceled": 6}
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == expected_counts["successful"]
    assert len(summary.get_failed_results()) == expected_counts["failed"]
    assert len(summary.get_canceled_results()) == expected_counts["canceled"]

    by_type = summary.get_results_by_type()
    for result_type, expected in expected_counts.items():
        assert len(by_type[result_type]) == expected
Ejemplo n.º 6
0
def test_cancel_on_failure_detect_by_runner(cleanup):
    """Check the result counts after a run submitted with ``-n2 -b8``.

    Expects one successful, one failed and six canceled results, both from
    the per-type getters and from ``get_results_by_type``.
    """
    # JobRunner handles the cancellation in JobQueue because the blocked job is in the batch
    # along with the blocking job.
    submit_cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b8"
    assert run_command(submit_cmd) == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    expected_counts = {"successful": 1, "failed": 1, "canceled": 6}
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == expected_counts["successful"]
    assert len(summary.get_failed_results()) == expected_counts["failed"]
    assert len(summary.get_canceled_results()) == expected_counts["canceled"]

    by_type = summary.get_results_by_type()
    for result_type, expected in expected_counts.items():
        assert len(by_type[result_type]) == expected
Ejemplo n.º 7
0
def test_resubmit_with_blocking_jobs(basic_setup):
    """Resubmit a missing job when some jobs have ``blocked_by`` set.

    Seven identical commands are configured, three of them with a
    ``blocked_by`` set. After a clean first run the result named "7" is
    removed from the aggregator's results and from the final results file
    (recorded there as missing). The resubmission is expected to restore all
    seven successes and to create a second batch config holding three jobs.
    """
    num_commands = 7
    command = 'echo "hello world"'
    with open(TEST_FILENAME, "w") as f_out:
        f_out.writelines(command + "\n" for _ in range(num_commands))

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    blockers_by_index = {3: {5}, 4: {7}, 6: {6}}
    for index, job_param in enumerate(list(inputs.iter_jobs())):
        if index in blockers_by_index:
            job_param.blocked_by = blockers_by_index[index]
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    job_results = aggregator.get_results_unsafe()
    assert job_results
    assert all(entry.return_code == 0 for entry in job_results)
    # Remove the first result named "7" from the aggregator's results.
    position = next(
        (idx for idx, entry in enumerate(job_results) if entry.name == "7"),
        None,
    )
    assert position is not None
    job_results.pop(position)
    aggregator._write_results(job_results)

    # Apply the matching edit to the final results file.
    results_path = os.path.join(OUTPUT, RESULTS_FILE)
    report = load_data(results_path)
    position = next(
        (idx for idx, entry in enumerate(report["results"]) if entry["name"] == "7"),
        None,
    )
    assert position is not None
    dropped = report["results"].pop(position)
    counts = report["results_summary"]
    counts["num_missing"] = 1
    counts["num_successful"] -= 1
    report["missing_jobs"] = [dropped["name"]]
    dump_data(report, results_path)

    before = ResultsSummary(OUTPUT)
    assert len(before.get_failed_results()) == 0
    assert len(before.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    after = ResultsSummary(OUTPUT)
    assert len(after.get_successful_results()) == num_commands

    second_batch_path = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_path.exists()
    second_batch_jobs = load_data(second_batch_path)["jobs"]
    assert len(second_batch_jobs) == 3