Example #1
def test_resubmit_failed(cleanup):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Flip the first job's return code to simulate a failure.
    x = results[0]
    results[0] = Result(x.name, 1, x.status, x.exec_time_s, x.completion_time)
    agg._write_results(results)

    # Patch the persisted summary file so ResultsSummary sees the failure.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Example #2
def test_resubmit_missing(cleanup):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Drop the last result to simulate a job that never finished.
    results.pop()
    agg._write_results(results)

    # Patch the persisted summary file so the job is reported as missing.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Example #3
    def _handle_completion(self, cluster):
        result = Status.GOOD
        self._results = ResultsAggregator.list_results(self._output)
        if len(self._results) != self._config.get_num_jobs():
            finished_jobs = {x.name for x in self._results}
            all_jobs = {x.name for x in self._config.iter_jobs()}
            missing_jobs = sorted(all_jobs.difference(finished_jobs))
            logger.error(
                "Error in result totals. num_results=%s total_num_jobs=%s",
                len(self._results),
                self._config.get_num_jobs(),
            )
            logger.error(
                "These jobs did not finish: %s. Check for process crashes or HPC timeouts.",
                missing_jobs,
            )
            result = Status.ERROR
        else:
            missing_jobs = []

        self.write_results_summary(RESULTS_FILE, missing_jobs)
        self._log_error_log_messages(self._output)

        bytes_consumed = get_directory_size_bytes(self._output,
                                                  recursive=False)
        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission completed",
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        group = self._config.get_default_submission_group()
        if group.submitter_params.generate_reports:
            self.generate_reports(self._output,
                                  group.submitter_params.resource_monitor_type)
        cluster.mark_complete()

        if cluster.config.pipeline_stage_num is not None:
            # The pipeline directory must be the one above this one.
            pipeline_dir = os.path.dirname(self._output)
            next_stage = cluster.config.pipeline_stage_num + 1
            cmd = (f"jade pipeline submit-next-stage {pipeline_dir} "
                   f"--stage-num={next_stage} "
                   f"--return-code={result.value}")
            run_command(cmd)

        return result
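A note on the missing-job detection above: it reduces to a set difference over job names. A tiny self-contained illustration, with hypothetical names:

finished_jobs = {"job_1", "job_3"}      # hypothetical set of finished job names
all_jobs = {"job_1", "job_2", "job_3"}  # hypothetical set of configured job names
missing_jobs = sorted(all_jobs.difference(finished_jobs))
assert missing_jobs == ["job_2"]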
Example #4
def test_demo_extension(test_data_dir):
    """Should create a config.json file"""
    config_file = os.path.join(test_data_dir, "config.json")

    if os.path.exists(config_file):
        os.remove(config_file)

    base = os.path.join(JADE_PATH, "extensions", "demo")
    create_demo_config = os.path.join(base, "create_demo_config.sh")
    create_merge_config = os.path.join(base, "create_merge_pred_gdp.py")
    config_file = os.path.join(test_data_dir, "pipeline.json")
    output = os.path.join(test_data_dir, "output")
    if os.path.exists(output):
        shutil.rmtree(output)

    try:
        cmd = (
            f"jade pipeline create {create_demo_config} {create_merge_config} -c {config_file} -l"
        )
        returncode = run_command(cmd=cmd)
        assert returncode == 0
        assert os.path.exists(config_file)

        returncode = run_command(
            f"jade pipeline submit {config_file} -o {output}")
        assert returncode == 0

        output_stage1 = os.path.join(output, "output-stage1")
        output_stage2 = os.path.join(output, "output-stage2")
        assert os.path.exists(output)
        assert os.path.exists(output_stage1)
        assert os.path.exists(output_stage2)

        job_outputs = os.path.join(output_stage1, "job-outputs")

        for country in ("australia", "brazil", "united_states"):
            results = os.listdir(os.path.join(job_outputs, country))
            assert "result.csv" in results
            assert "result.png" in results
            assert "summary.toml" in results

        pred_gdp_file = os.path.join(output_stage2, "pred_gdp.csv")
        assert os.path.exists(pred_gdp_file)
        df = pd.read_csv(pred_gdp_file)
        assert "year" in df.columns
        assert "brazil" in df.columns
        assert "united_states" in df.columns
        assert "australia" in df.columns

    finally:
        if os.path.exists(output):
            shutil.rmtree(output)

        if os.path.exists(config_file):
            os.remove(config_file)

        if os.path.exists(PRED_GDP_COMMANDS_FILE):
            os.remove(PRED_GDP_COMMANDS_FILE)
Example #5
def test_stats__bytes_consumed(example_output):
    output = {}
    ret = run_command(f"jade stats bytes-consumed -o {example_output}", output)
    assert ret == 0
    assert len(output["stdout"]) > 0

    ret = run_command(f"jade stats bytes-consumed --no-human-readable -o {example_output}", output)
    assert ret == 0
    bytes_consumed = int(output["stdout"].strip())
    assert bytes_consumed > 0
Example #6
def test_stats__exec_time(example_output):
    output = {}
    ret = run_command(f"jade stats exec-time -o {example_output}", output)
    assert ret == 0
    assert len(output["stdout"]) > 0

    ret = run_command(f"jade stats exec-time --no-human-readable -o {example_output}", output)
    assert ret == 0
    exec_time = float(output["stdout"].strip())
    assert exec_time > 0
Example #7
def test_config__show(cleanup):
    ret = run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}")
    assert ret == 0
    assert os.path.exists(CONFIG1)

    output = {}
    ret = run_command(f"jade config show {CONFIG1}", output=output)
    assert ret == 0

    for country in ("australia", "brazil", "united_states"):
        assert country in output["stdout"]
Example #8
def test_config__filter_show_only(cleanup):
    ret = run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}")
    assert ret == 0
    assert os.path.exists(CONFIG1)

    output = {}
    ret = run_command(f"jade config filter {CONFIG1} -f country brazil", output=output)
    assert ret == 0
    assert not os.path.exists(CONFIG2)

    assert "brazil" in output["stdout"]
Example #9
def test_config__assign_blocked_by(cleanup):
    config = GenericCommandConfiguration()
    base_cmd = "bash my_script.sh"
    regular_job_names = []
    for i in range(1, 4):
        cmd = base_cmd + " " + str(i)
        name = f"job_{i}"
        job = GenericCommandParameters(
            command=cmd,
            name=name,
            append_job_name=True,
            append_output_dir=True,
        )
        config.add_job(job)
        regular_job_names.append(name)

    pp_name = "post_process"
    post_process_job = GenericCommandParameters(
        command="bash run_post_process.sh",
        name=pp_name,
        append_job_name=True,
        append_output_dir=True,
    )
    config.add_job(post_process_job)
    config_file = CONFIG1
    config.dump(config_file, indent=2)

    ret = run_command(
        f"jade config assign-blocked-by {CONFIG1} 3 -o {CONFIG2}")
    assert ret == 0
    assert os.path.exists(CONFIG2)

    config = load_data(CONFIG2)
    assert sorted(config["jobs"][3]["blocked_by"]) == sorted(regular_job_names)

    os.remove(CONFIG2)
    ret = run_command(
        f"jade config assign-blocked-by {CONFIG1} 3 1 2 -o {CONFIG2}")
    assert ret == 0
    assert os.path.exists(CONFIG2)
    config = load_data(CONFIG2)
    expected = [regular_job_names[1], regular_job_names[2]]
    assert sorted(config["jobs"][3]["blocked_by"]) == sorted(expected)

    # Including the post-process job itself in blocking-job-indexes must fail.
    ret = run_command(
        f"jade config assign-blocked-by {CONFIG1} 3 1 2 3 -o {CONFIG2}")
    assert ret != 0

    # Invalid job index
    ret = run_command(
        f"jade config assign-blocked-by {CONFIG1} 47 1 2 -o {CONFIG2}")
    assert ret != 0
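A minimal sketch of the assign-blocked-by behavior the test above exercises (a hypothetical helper, not JADE's implementation): the target job index gets blocked by the given indexes, or by every other job when none are given, and self-blocking or out-of-range indexes are rejected.

def assign_blocked_by(jobs, target_index, blocking_indexes=None):
    # Hypothetical reduction of the CLI behavior asserted in the test above.
    if blocking_indexes is None:
        blocking_indexes = [i for i in range(len(jobs)) if i != target_index]
    in_range = all(0 <= i < len(jobs) for i in blocking_indexes)
    if target_index in blocking_indexes or not in_range:
        raise ValueError("invalid blocking-job indexes")
    jobs[target_index]["blocked_by"] = sorted(jobs[i]["name"] for i in blocking_indexes)

jobs = [{"name": f"job_{i}"} for i in range(1, 4)] + [{"name": "post_process"}]
assign_blocked_by(jobs, 3)
assert jobs[3]["blocked_by"] == ["job_1", "job_2", "job_3"]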
Example #10
def test_config__filter_copy(cleanup):
    ret = run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}")
    assert ret == 0
    assert os.path.exists(CONFIG1)

    ret = run_command(f"jade config filter {CONFIG1} -o {CONFIG2}")
    assert ret == 0
    assert os.path.exists(CONFIG2)

    config1 = load_data(CONFIG1)
    config2 = load_data(CONFIG2)
    assert config1 == config2
Example #11
def test_config__filter_range(cleanup):
    ret = run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}")
    assert ret == 0
    assert os.path.exists(CONFIG1)

    ret = run_command(f"jade config filter {CONFIG1} -o {CONFIG2} 0 1")
    assert ret == 0
    assert os.path.exists(CONFIG2)

    config1 = load_data(CONFIG1)
    config2 = load_data(CONFIG2)
    assert config2["jobs"] == [config1["jobs"][0], config1["jobs"][1]]
Example #12
def test_collect_stats():
    output_dir = os.path.join(tempfile.gettempdir(), "test-stats-output")
    try:
        ret = run_command(f"jade stats collect -i1 -o {output_dir} -d 1 -f")
        assert ret == 0
        cmd = f"jade stats show -o {output_dir} cpu disk mem net"
        output = {}
        ret = run_command(cmd, output=output)
        assert ret == 0
        for term in ("IOPS", "read_bytes", "bytes_recv", "idle"):
            assert term in output["stdout"]
    finally:
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
Example #13
def run_jobs(config_file, distributed_submitter, output, num_processes,
             verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)

    # Logging has to be enabled after the JobRunner is created because the
    # node ID is what makes the log filename unique.
    filename = os.path.join(output,
                            f"run_jobs_batch_{batch_id}_{mgr.node_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_event_logging(mgr.event_filename)
    logger = setup_logging(__name__,
                           filename,
                           file_level=level,
                           console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    group = mgr.config.get_default_submission_group()
    if group.submitter_params.node_setup_script:
        cmd = f"{group.submitter_params.node_setup_script} {config_file} {output}"
        ret = run_command(cmd)
        if ret != 0:
            logger.error("Failed to run node setup script %s: %s", cmd, ret)
            sys.exit(ret)

    status = mgr.run_jobs(distributed_submitter=distributed_submitter,
                          verbose=verbose,
                          num_processes=num_processes)
    ret = status.value

    if group.submitter_params.node_shutdown_script:
        cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {output}"
        ret2 = run_command(cmd)
        if ret2 != 0:
            logger.error("Failed to run node shutdown script %s: %s", cmd,
                         ret2)

    if status == Status.GOOD and distributed_submitter:
        start = time.time()
        _try_submit_jobs(output, verbose=verbose)
        logger.info("try-submit-jobs took %s seconds", time.time() - start)

    sys.exit(ret)
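run_jobs derives the batch ID from the config filename, so it assumes names matching batch_<number>.json. A quick self-contained check of that pattern:

import re

# Hypothetical filename; the regular expression is the same one run_jobs uses.
match = re.search(r"batch_(\d+)\.json", "/tmp/output/config_batch_7.json")
assert match is not None and match.group(1) == "7"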
Example #14
    def _submit(self, verbose):
        for stage in self._stages:
            os.environ["JADE_PIPELINE_STAGE_ID"] = str(self._cur_stage_id)
            stage_info = {
                "stage_id": self._cur_stage_id,
                "output_directory": self.get_stage_output_path(self._output, self._cur_stage_id),
            }
            self._status_info["stages"].append(stage_info)
            self._dump_status()
            self._run_auto_config(stage)
            cmd = self._make_submit_cmd(stage, verbose)
            start = time.time()
            ret = run_command(cmd)
            end = time.time()
            exec_time = end - start
            result = Result(str(self._cur_stage_id), ret, "finished", exec_time, end)
            self._status_info["stages"][-1]["result"] = serialize_result(result)
            self._dump_status()
            if ret != 0:
                raise ExecutionError(f"stage {self._cur_stage_id} failed")
            self._cur_stage_id += 1

        logger.info("Finished execution pipeline")
Example #15
    def generate_reports(directory):
        """Create reports summarizing the output results of a set of jobs.

        Parameters
        ----------
        directory : str
            output directory

        """
        commands = (
            (f"jade show-results -o {directory}", "results.txt"),
            (f"jade show-events -o {directory} --categories Error",
             "errors.txt"),
            (f"jade stats show -o {directory}", "stats.txt"),
        )

        reports = []
        for cmd in commands:
            output = {}
            ret = run_command(cmd[0], output=output)
            if ret != 0:
                return ret

            filename = os.path.join(directory, cmd[1])
            with open(filename, "w") as f_out:
                f_out.write(cmd[0] + "\n\n")
                f_out.write(output["stdout"])
                reports.append(filename)

        logger.info("Generated reports %s.", " ".join(reports))
        return 0
Example #16
    def check_status(self, name=None, job_id=None):
        field_names = ("jobid", "name", "state")
        cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
        if name is not None:
            cmd += f" -n {name}"
        elif job_id is not None:
            cmd += f" -j {job_id}"
        else:
            # Mutual exclusivity should be handled in HpcManager.
            assert False

        output = {}
        ret = run_command(cmd, output)
        if ret != 0:
            logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                         cmd, ret, output["stderr"])
            raise ExecutionError(f"squeue command failed: {ret}")

        stdout = output["stdout"]
        logger.debug("squeue output:  [%s]", stdout)
        fields = stdout.split()
        if not fields:
            # No jobs are currently running.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        assert len(fields) == len(field_names)
        status = self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN)
        job_info = HpcJobInfo(fields[0], fields[1], status)
        return job_info
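check_status assumes squeue's --Format output is whitespace-delimited in the requested field order and, because the query is filtered by name or job ID, contains at most one row. A minimal check of that parsing assumption against a canned line (hypothetical values):

field_names = ("jobid", "name", "state")
stdout = "123456 my_job RUNNING\n"  # canned single-row squeue output
fields = stdout.split()
assert len(fields) == len(field_names)
assert fields[0] == "123456" and fields[2] == "RUNNING"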
Example #17
def test_cancel_on_failure_detect_by_submitter(cleanup):
    # HpcSubmitter handles the cancellation because the blocked job will be in the 2nd batch.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b2"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == 1
    assert len(summary.get_failed_results()) == 1
    assert len(summary.get_canceled_results()) == 6
    results = summary.get_results_by_type()
    assert len(results["successful"]) == 1
    assert len(results["failed"]) == 1
    assert len(results["canceled"]) == 6
Example #18
def test_run_generic_commands(generic_command_fixture):
    commands = [
        "ls .",
        "ls invalid-file-path",
    ]

    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 2
    config.dump(CONFIG_FILE)

    cmds = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1",
        # Test with higher queue depth. This exercises the code paths but
        # doesn't actually verify the functionality.
        # The infrastructure to do that is currently lacking. TODO
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -q 32",
    )

    for cmd in cmds:
        ret = run_command(cmd)
        assert ret == 0
Example #19
def test_run_command__stdout():
    """Should capture stdout from a command run as a subprocess"""
    command = "echo 'Hello Disco'"
    output = {}
    ret = run_command(command, output)
    assert ret == 0
    assert "stdout" in output
    assert "Hello Disco" in output["stdout"]
Example #20
def test_cancel_on_failure_detect_by_runner(cleanup):
    # JobRunner handles the cancellation in JobQueue because the blocked job is in the batch
    # along with the blocking job.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b8"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == 1
    assert len(summary.get_failed_results()) == 1
    assert len(summary.get_canceled_results()) == 6
    results = summary.get_results_by_type()
    assert len(results["successful"]) == 1
    assert len(results["failed"]) == 1
    assert len(results["canceled"]) == 6
Example #21
def test_stats__show(example_output):
    output = {}
    ret = run_command(f"jade stats show -o {example_output}", output)
    assert ret == 0
    assert len(output["stdout"]) > 0

    for text in ("Network stat", "Memory stat", "Disk stat", "CPU stat"):
        assert text in output["stdout"]
Example #22
def _try_submit_jobs(output, verbose):
    try_submit_cmd = f"jade try-submit-jobs {output}"
    if verbose:
        try_submit_cmd += " --verbose"
    ret = run_command(try_submit_cmd)
    if ret != 0:
        logger = logging.getLogger(__name__)
        logger.error("Failed to run '%s' ret=%s", try_submit_cmd, ret)
Example #23
def test_run_command():
    """Should capture stderr when a command run as a subprocess fails"""
    command = "ls -l /dirnotexit"
    output = {}
    ret = run_command(command, output)
    assert ret != 0
    assert "stderr" in output
    assert "No such file or directory" in output["stderr"]
Example #24
    def _qstat(self):
        """Run the PBS qstat command and return the stdout split to rows.

        Returns
        -------
        qstat_rows : list | None
            List of strings where each string is a row in the qstat printout.
            Returns None if qstat is empty.

        """
        cmd = "qstat -u {user}".format(user=self.USER)
        output = {}
        run_command(cmd, output)
        if not output["stdout"]:
            # No jobs are currently running.
            return None

        qstat_rows = output["stdout"].split("\n")
        return qstat_rows
Example #25
    def submit(self, filename):
        output = {}
        ret = run_command("qsub {}".format(filename), output)
        if ret == 0:
            result = Status.GOOD
            # qsub prints the new job ID on stdout.
            job_id = output["stdout"]
        else:
            result = Status.ERROR
            job_id = None

        return result, job_id, output["stderr"]
Example #26
def test_stats__plot(example_output):
    path = os.path.join(example_output, "stats")
    try:
        ret = run_command(f"jade stats plot -o {example_output}")
        assert ret == 0
        for stat in ("Cpu", "Disk", "Memory", "Network"):
            filename = os.path.join(
                path, stat + "StatsViewer__resource_monitor_batch_0.html")
            assert os.path.exists(filename)
    finally:
        if os.path.exists(path):
            shutil.rmtree(path)
Example #27
def main():
    status = load_data(os.environ["JADE_PIPELINE_STATUS_FILE"])
    cur_stage = status["stages"][-1]
    cur_stage_output = cur_stage["output_directory"]
    previous_stage = status["stages"][-2]
    previous_stage_output = previous_stage["output_directory"]
    script = "jade/extensions/demo/merge_pred_gdp.py"

    with open(PRED_GDP_COMMANDS_FILE, "w") as f_out:
        cmd = f"python {script} run {previous_stage_output} {cur_stage_output}"
        f_out.write(cmd + "\n")

    cmd = "jade config create pred_gdp_commands.txt -c config-stage2.json"
    sys.exit(run_command(cmd))
Example #28
    def _run_command(self, cmd):
        orig = os.getcwd()
        os.chdir(self._path)

        try:
            output = {}
            ret = run_command(cmd, output=output)
            if ret != 0:
                raise ExecutionError(
                    f"[{cmd}] failed: {ret}: {output['stderr']}")

            return output["stdout"].strip()
        finally:
            os.chdir(orig)
Example #29
def main():
    config = PipelineConfig(
        **load_data(os.environ["JADE_PIPELINE_STATUS_FILE"]))
    cur_stage = config.stages[-1]
    cur_stage_output = cur_stage.path
    previous_stage = config.stages[-2]
    previous_stage_output = previous_stage.path
    script = "jade/extensions/demo/merge_pred_gdp.py"

    with open(PRED_GDP_COMMANDS_FILE, "w") as f_out:
        cmd = f"python {script} run {previous_stage_output} {cur_stage_output}"
        f_out.write(cmd + "\n")

    cmd = "jade config create pred_gdp_commands.txt -c config-stage2.json"
    sys.exit(run_command(cmd))
Example #30
def test_try_add_blocked_jobs(cleanup):
    num_commands = 5
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    for i, job_param in enumerate(jobs):
        if i == num_commands - 1:
            # Block the last job on all of the others.
            job_param.blocked_by = {1, 2, 3, 4}
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    for option in ("--try-add-blocked-jobs", "--no-try-add-blocked-jobs"):
        cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} -p 0.1 {option}"
        ret = run_command(cmd)
        assert ret == 0
        ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
        assert ret == 0
        # Submission events land in submit_jobs_events.log under OUTPUT.
        events_summary = EventsSummary(OUTPUT, preload=True)
        submit_events = events_summary.list_events(EVENT_NAME_HPC_SUBMIT)
        if option == "--try-add-blocked-jobs":
            assert len(submit_events) == 1
            event = submit_events[0]
            assert event.data["batch_size"] == num_commands
            shutil.rmtree(OUTPUT)
        else:
            assert len(submit_events) == 2
            event1 = submit_events[0]
            event2 = submit_events[1]
            assert event1.data["batch_size"] == num_commands - 1
            assert event2.data["batch_size"] == 1
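The two submit-event counts above follow from how blocked jobs are batched. A self-contained sketch of that batching logic (hypothetical code under stated assumptions: jobs keyed by name, blocked_by mapping each job to its blockers; not JADE's scheduler):

def batch_jobs(jobs, blocked_by, try_add_blocked_jobs):
    # Group jobs into submission batches; a hypothetical reduction of the
    # behavior the test asserts through batch_size events.
    done, batches, remaining = set(), [], list(jobs)
    while remaining:
        batch = [j for j in remaining if blocked_by.get(j, set()) <= done]
        if try_add_blocked_jobs:
            # Also pull in jobs whose blockers all ride in this same batch.
            for j in remaining:
                if j not in batch and blocked_by.get(j, set()) <= done | set(batch):
                    batch.append(j)
        batches.append(batch)
        done |= set(batch)
        remaining = [j for j in remaining if j not in done]
    return batches

jobs = [str(i) for i in range(1, 6)]
deps = {"5": {"1", "2", "3", "4"}}
assert [len(b) for b in batch_jobs(jobs, deps, True)] == [5]      # one submit event
assert [len(b) for b in batch_jobs(jobs, deps, False)] == [4, 1]  # two submit events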