Example 1
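These snippets come from a JADE test suite and lean on module-level constants, imports, and fixtures that are not shown. The sketch below is assumed scaffolding: the names mirror how the constants are used in the examples, but the values are illustrative, not the project's actual definitions. The jade imports (GenericCommandInputs, GenericCommandConfiguration, SubmitterParams, and so on) and pytest are likewise assumed to be in scope.

# Assumed test scaffolding (sketch); values are illustrative only.
TEST_FILENAME = "test-inputs.txt"          # one shell command per line
CONFIG_FILE = "test-config.json"           # serialized job configuration
OUTPUT = "test-output"                     # directory that receives results
FAKE_HPC_CONFIG = "fake_hpc_config.toml"   # HPC profile faking a cluster
SUBMIT_JOBS = "jade submit-jobs"           # CLI entry points the tests drive
RESUBMIT_JOBS = "jade resubmit-jobs"
WAIT = "jade wait"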
def test_run_generic_commands(generic_command_fixture):
    commands = [
        "ls .",
        "ls invalid-file-path",
    ]

    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 2

    config.dump(CONFIG_FILE)

    cmds = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -h {FAKE_HPC_CONFIG}",
        # Test with a higher queue depth. This exercises the code paths but
        # doesn't actually verify the behavior; the test infrastructure to do
        # that is currently lacking. TODO: verify once it exists.
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -q 32 -h {FAKE_HPC_CONFIG}",
    )

    for cmd in cmds:
        check_run_command(cmd)
        check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    assert list(Path(OUTPUT).glob("*.sh"))
    check_run_command(f"jade prune-files {OUTPUT}")
    assert not list(Path(OUTPUT).glob("*.sh"))
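Several examples call check_run_command and run_command without defining them. A minimal sketch of what they are assumed to do (the project's real helpers may behave differently, e.g., capture output):

import subprocess

def run_command(cmd: str) -> int:
    # Assumed helper: run a shell command and return its exit code.
    return subprocess.run(cmd, shell=True).returncode

def check_run_command(cmd: str) -> None:
    # Assumed helper: fail the test if the command does not exit 0.
    assert run_command(cmd) == 0, f"command failed: {cmd}"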
Example 2
def create_config():
    num_commands = 5
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    for i, job_param in enumerate(jobs):
        if i < 3:
            job_param.submission_group = "group1"
        else:
            job_param.submission_group = "group2"
        config.add_job(job_param)

    hpc_config1 = load_data(FAKE_HPC_CONFIG)
    hpc_config2 = copy.deepcopy(hpc_config1)
    hpc_config1["hpc"]["walltime"] = "1:00:00"
    hpc_config2["hpc"]["walltime"] = "5:00:00"
    params1 = SubmitterParams(hpc_config=hpc_config1, per_node_batch_size=3)
    params2 = SubmitterParams(hpc_config=hpc_config2, per_node_batch_size=1)
    group1 = SubmissionGroup(name="group1", submitter_params=params1)
    group2 = SubmissionGroup(name="group2", submitter_params=params2)
    config.append_submission_group(group1)
    config.append_submission_group(group2)
    return config
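A hypothetical check of the grouping that create_config sets up; it assumes submission_group can be read back from each job and relies on iter_jobs preserving insertion order (Example 4 relies on the same ordering):

config = create_config()
assert [job.submission_group for job in config.iter_jobs()] == (
    ["group1"] * 3 + ["group2"] * 2
)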
Example 3
def test_job_order(generic_command_fixture):
    num_jobs = 50
    commands = ["echo hello world"] * num_jobs

    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    job = config.get_job("1")
    for i in range(10, 15):
        job.blocked_by.add(i)

    config.get_job("2").blocked_by.add("1")
    config.get_job("21").blocked_by.add("30")
    config.get_job("41").blocked_by.add("50")
    config.dump(CONFIG_FILE)

    cmd = (f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
           "--per-node-batch-size=10 "
           "--max-nodes=4 "
           "--poll-interval=0.1 "
           f"--hpc-config {FAKE_HPC_CONFIG} "
           "--num-processes=10")
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    result_summary = ResultsSummary(OUTPUT)
    results = result_summary.list_results()
    assert len(results) == num_jobs
    tracker = {x.name: x for x in results}

    for i in range(10, 15):
        assert tracker["1"].completion_time > tracker[str(i)].completion_time

    assert tracker["2"].completion_time > tracker["1"].completion_time
    assert tracker["21"].completion_time > tracker["30"].completion_time
    assert tracker["41"].completion_time > tracker["50"].completion_time

    # Verify that stats are summarized correctly with aggregation mode.
    stats_text = Path(OUTPUT) / "stats.txt"
    assert stats_text.exists()
    assert "Average" in stats_text.read_text()
    stats_json = Path(OUTPUT) / "stats_summary.json"
    assert stats_json.exists()
    stats = load_data(stats_json)
    assert stats
    assert "batch" in stats[0]
Example 4
def test_job_configuration__shuffle_jobs(job_fixture):
    num_jobs = 10
    with open(TEST_FILENAME, "w") as f_out:
        for i in range(num_jobs):
            f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    assert [x.name for x in config.iter_jobs()] == [str(x) for x in range(1, num_jobs + 1)]
    config.shuffle_jobs()
    # A uniform shuffle of 10 jobs returns the original order with probability
    # 1/10!, so the comparison below is effectively deterministic.
    assert [x.name for x in config.iter_jobs()] != [str(x) for x in range(1, num_jobs + 1)]
Example 5
def test_job_configuration__check_job_dependencies_estimate(job_fixture):
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 1

    hpc_config = HpcConfig(**load_data(FAKE_HPC_CONFIG))
    # A per-node batch size of 0 presumably batches jobs by their runtime
    # estimates; these jobs define none, so the dependency check must fail.
    params = SubmitterParams(hpc_config=hpc_config, per_node_batch_size=0)
    with pytest.raises(InvalidConfiguration):
        config.check_job_dependencies(params)
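For contrast, a sketch of the presumed passing case. Example 11 shows that jobs carry an estimated_run_minutes attribute; the assumption here is that the dependency check succeeds once every job has an estimate to batch on:

for job in config.iter_jobs():
    job.estimated_run_minutes = 5  # assumption: an estimate satisfies the check
config.check_job_dependencies(params)  # presumed to no longer raise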
Example 6
def test_job_configuration__custom_names(job_fixture):
    num_jobs = 3
    with open(TEST_FILENAME, "w") as f_out:
        for i in range(num_jobs):
            f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for i, job_param in enumerate(inputs.iter_jobs()):
        job_param.name = f"job_{i}"
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    job = GenericCommandParameters(command="echo hello world", name="job_2")
    with pytest.raises(InvalidConfiguration):
        config.add_job(job)
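Assuming get_job looks jobs up by name, as it does with the default numeric names in the other examples, a quick sanity check of the renaming might read:

job = config.get_job("job_1")  # hypothetical lookup by the custom name
assert job.name == "job_1"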
Example 7
@pytest.fixture
def cleanup():
    _do_cleanup()
    commands = [
        'echo "hello"',
        "ls invalid-path",
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
        'echo "hello"',
    ]
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(
        inputs, cancel_on_blocking_job_failure=True)
    config.get_job("3").set_blocking_jobs(set([2]))
    config.get_job("4").set_blocking_jobs(set([3]))
    config.get_job("5").set_blocking_jobs(set([4]))
    config.get_job("6").set_blocking_jobs(set([5]))
    config.get_job("7").set_blocking_jobs(set([6]))
    config.get_job("8").set_blocking_jobs(set([7]))
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()
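Both fixtures call _do_cleanup, which is not shown. A plausible sketch, assuming it merely removes artifacts left by earlier runs (the actual helper may differ):

import os
import shutil

def _do_cleanup():
    # Assumed helper: delete files/directories produced by a prior test run.
    for path in (TEST_FILENAME, CONFIG_FILE, OUTPUT):
        if os.path.isdir(path):
            shutil.rmtree(path)
        elif os.path.exists(path):
            os.remove(path)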
Example 8
@pytest.fixture
def cleanup():
    _do_cleanup()
    commands = ['echo "hello world"'] * NUM_COMMANDS
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(inputs)
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()
Example 9
def test_job_configuration__check_job_dependencies_blocking(job_fixture):
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 1

    hpc_config = HpcConfig(**load_data(FAKE_HPC_CONFIG))
    params = SubmitterParams(hpc_config=hpc_config)
    job = config.get_job("1")
    job.blocked_by.add("10")
    with pytest.raises(InvalidConfiguration):
        config.check_job_dependencies(params)

    # While we have this setup, verify that submit-jobs calls this function.
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} " "--poll-interval=.1 "
    ret = run_command(cmd)
    assert ret != 0
Example 10
def test_try_add_blocked_jobs(cleanup):
    num_commands = 5
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    for i, job_param in enumerate(jobs):
        if i == num_commands - 1:
            job_param.blocked_by = {1, 2, 3, 4}
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    for option in ("--try-add-blocked-jobs", "--no-try-add-blocked-jobs"):
        cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} -p 0.1 {option}"
        ret = run_command(cmd)
        assert ret == 0
        ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
        assert ret == 0
        events_summary = EventsSummary(OUTPUT, preload=True)
        submit_events = events_summary.list_events(EVENT_NAME_HPC_SUBMIT)
        if option == "--try-add-blocked-jobs":
            assert len(submit_events) == 1
            event = submit_events[0]
            assert event.data["batch_size"] == num_commands
            shutil.rmtree(OUTPUT)
        else:
            assert len(submit_events) == 2
            event1 = submit_events[0]
            event2 = submit_events[1]
            assert event1.data["batch_size"] == num_commands - 1
            assert event2.data["batch_size"] == 1
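The assertions above capture the intended contrast: with --try-add-blocked-jobs, the blocked fifth job is packed into the single submission (batch_size equals num_commands), while with --no-try-add-blocked-jobs the blocked job waits for its blockers and is submitted afterward as a one-job batch.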
Example 11
@pytest.fixture
def job_too_long():
    _do_cleanup()
    commands = ['echo "hello world"'] * NUM_COMMANDS
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(inputs,
                                                     minutes_per_job=10)
    # Give the second job an estimate far beyond the 10-minute default.
    for i, job in enumerate(config.iter_jobs()):
        if i == 1:
            job.estimated_run_minutes = 1000
            break
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()
Example 12
def test_resubmit_with_blocking_jobs(basic_setup):
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is
    # recursive: job 4 is blocked by 5 and job 5 by 7, so resubmitting job 7
    # must recursively pull in 5 and then 4 (the three-job second batch
    # checked at the end).
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = {5}
        elif i == 4:
            job_param.blocked_by = {7}
        elif i == 6:
            job_param.blocked_by = {6}
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands

    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3