Code example #1
def cancel(self):
    self._return_code = 1
    self._is_complete = True
    result = Result(self._job.name, self._return_code,
                    JobCompletionStatus.CANCELED, 0.0)
    ResultsAggregator.append(self._output, result, batch_id=self._batch_id)
    logger.info("Canceled job %s", self._job.name)
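For context, the record appended by cancel() can be read back through the aggregator. A minimal sketch, using only calls that appear in later examples on this page (ResultsAggregator.load and get_results); output here stands in for the run's output directory:

# Hypothetical read-back of canceled jobs; `output` is the run's output directory.
agg = ResultsAggregator.load(output)
canceled = [r for r in agg.get_results() if r.status == JobCompletionStatus.CANCELED]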
Code example #2
File: dispatchable_job.py Project: daniel-thom/jade
    def _complete(self):
        ret = self._pipe.returncode
        exec_time_s = time.time() - self._start_time

        # Sanitize the job name so it can be used in a filename.
        job_filename = self._job.name
        illegal_chars = ("/", "\\", ":")
        for char in illegal_chars:
            job_filename = job_filename.replace(char, "-")

        status = "finished"
        output_dir = os.path.join(self._output, JOBS_OUTPUT_DIR,
                                  self._job.name)
        bytes_consumed = get_directory_size_bytes(output_dir)
        event = StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)
        result = Result(self._job.name, ret, status, exec_time_s)
        ResultsAggregator.append(self._results_filename, result)

        logger.info("Job %s completed return_code=%s exec_time_s=%s",
                    self._job.name, ret, exec_time_s)
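The aggregator persists results as CSV (example #10 below cleans up .csv files in the results directory). A hypothetical row for the Result built above, with the column order inferred from the Result constructor in example #8; the exact serialization is an assumption:

# Assumed columns: name,return_code,status,exec_time_s,completion_time
# my-job,0,finished,12.5,1617304440.0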
Code example #3
File: test_async_cli_command.py Project: NREL/jade
@pytest.fixture
def async_cmd():
    """Async CLI command fixture"""
    job = mock.MagicMock()
    job.name = "Test-Job"
    cmd = "echo 'Hello World'"
    output = os.path.join(tempfile.gettempdir(), "jade-test-async-cli-job")
    os.makedirs(output, exist_ok=True)
    os.makedirs(os.path.join(output, RESULTS_DIR), exist_ok=True)
    ResultsAggregator.create(output)
    # Trailing args are presumably batch_id, is_manager_node, and hpc_job_id (cf. examples #4 and #11).
    cmd = AsyncCliCommand(job, cmd, output, 1, True, "0")
    yield cmd
    shutil.rmtree(output)
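A test consumes the fixture by naming it as a parameter. A hypothetical example that relies only on attributes shown elsewhere on this page (cancel(), _is_complete, and _return_code from examples #1 and #4):

def test_cancel(async_cmd):
    # Canceling should mark the command complete with a non-zero return code.
    async_cmd.cancel()
    assert async_cmd._is_complete
    assert async_cmd._return_code == 1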
Code example #4
File: async_cli_command.py Project: NREL/jade
def cancel(self):
    self._return_code = 1
    self._is_complete = True
    if self._is_manager_node:
        result = Result(
            self._job.name,
            self._return_code,
            JobCompletionStatus.CANCELED,
            0.0,
            hpc_job_id=self._hpc_job_id,
        )
        ResultsAggregator.append(self._output, result, batch_id=self._batch_id)
        logger.info("Canceled job %s", self._job.name)
    else:
        logger.info("Canceled job %s on non-manager node", self._job.name)
Code example #5
File: hpc_submitter.py Project: NREL/jade
    def _update_completed_jobs(self):
        newly_completed = set()
        canceled_jobs = []
        # If jobs fail and are configured to cancel blocked jobs, we may need to run this
        # loop many times to cancel the entire chain. For example, in a chain A <- B <- C,
        # a failure of A cancels B on the first pass, and only the next pass sees B's
        # canceled result and can cancel C.
        aggregator = ResultsAggregator.load(self._output)
        need_to_rerun = True
        new_results = []
        while need_to_rerun:
            need_to_rerun = False
            failed_jobs = set()
            for result in itertools.chain(aggregator.process_results(),
                                          new_results):
                newly_completed.add(result.name)
                if result.return_code != 0:
                    failed_jobs.add(result.name)
            new_results.clear()

            logger.debug("Detected completion of jobs: %s", newly_completed)
            logger.debug("Detected failed jobs: %s", failed_jobs)
            for job in self._cluster.iter_jobs(state=JobState.NOT_SUBMITTED):
                if job.blocked_by:
                    if job.cancel_on_blocking_job_failure and job.blocked_by.intersection(
                            failed_jobs):
                        result = self._cancel_job(job, aggregator)
                        canceled_jobs.append(job)
                        new_results.append(result)
                        need_to_rerun = True
                    else:
                        job.blocked_by.difference_update(newly_completed)

        return newly_completed, canceled_jobs
Code example #6
File: job_runner.py Project: daniel-thom/jade
    def _generate_jobs(self, config_file, verbose):
        job_exec_class = self._config.job_execution_class()
        results_filename = get_results_temp_filename(self._output, self._batch_id)
        results_aggregator = ResultsAggregator(results_filename)
        results_aggregator.create_file()

        return [
            DispatchableJob(
                job,
                job_exec_class.generate_command(
                    job, self._jobs_output, config_file, verbose=verbose
                ),
                self._output,
                results_filename,
            )
            for job in self._config.iter_jobs()
        ]
Code example #7
def test_resubmit_missing(cleanup):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Drop the last result to simulate a job whose result was never recorded.
    results.pop()
    agg._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Code example #8
def test_resubmit_failed(cleanup):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Rewrite the first result with a non-zero return code to simulate a failure.
    x = results[0]
    results[0] = Result(x.name, 1, x.status, x.exec_time_s, x.completion_time)
    agg._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
Code example #9
    def _handle_completion(self, cluster):
        result = Status.GOOD
        self._results = ResultsAggregator.list_results(self._output)
        if len(self._results) != self._config.get_num_jobs():
            finished_jobs = {x.name for x in self._results}
            all_jobs = {x.name for x in self._config.iter_jobs()}
            missing_jobs = sorted(all_jobs.difference(finished_jobs))
            logger.error(
                "Error in result totals. num_results=%s total_num_jobs=%s",
                len(self._results),
                self._config.get_num_jobs(),
            )
            logger.error(
                "These jobs did not finish: %s. Check for process crashes or HPC timeouts.",
                missing_jobs,
            )
            result = Status.ERROR
        else:
            missing_jobs = []

        self.write_results_summary(RESULTS_FILE, missing_jobs)
        self._log_error_log_messages(self._output)

        bytes_consumed = get_directory_size_bytes(self._output,
                                                  recursive=False)
        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission completed",
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        group = self._config.get_default_submission_group()
        if group.submitter_params.generate_reports:
            self.generate_reports(self._output,
                                  group.submitter_params.resource_monitor_type)
        cluster.mark_complete()

        if cluster.config.pipeline_stage_num is not None:
            # The pipeline directory must be the one above this one.
            pipeline_dir = os.path.dirname(self._output)
            next_stage = cluster.config.pipeline_stage_num + 1
            cmd = (f"jade pipeline submit-next-stage {pipeline_dir} "
                   f"--stage-num={next_stage} "
                   f"--return-code={result.value}")
            run_command(cmd)

        return result
Code example #10
def test_results_aggregator(cleanup):
    """Test ResultsAggregator"""
    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)
    os.makedirs(os.path.join(OUTPUT, RESULTS_DIR))

    results = [create_result(i) for i in range(100)]
    batch1_file = get_results_temp_filename(OUTPUT, 1)
    batch2_file = get_results_temp_filename(OUTPUT, 2)
    pytest.aggregator1 = ResultsAggregator(batch1_file)
    pytest.aggregator2 = ResultsAggregator(batch2_file)
    pytest.aggregator1.create_file()
    pytest.aggregator2.create_file()
    assert os.path.exists(pytest.aggregator1._filename)
    assert os.path.exists(pytest.aggregator2._filename)

    # `append` is a module-level helper (not shown here) that routes each result
    # to one of the two aggregators; see the sketch after this example.
    with ProcessPoolExecutor() as executor:
        executor.map(append, results)

    final_results1 = pytest.aggregator1.get_results()
    final_results1.sort(key=lambda x: int(x.name))
    final_results2 = pytest.aggregator2.get_results()
    final_results2.sort(key=lambda x: int(x.name))

    expected1 = [x for x in results if int(x.name) % 2 == 0]
    expected2 = [x for x in results if int(x.name) % 2 != 0]

    assert final_results1 == expected1
    assert final_results2 == expected2

    results_dir = os.path.join(OUTPUT, RESULTS_DIR)
    summary = ResultsAggregatorSummary(results_dir)
    final_results = summary.get_results()
    final_results.sort(key=lambda x: int(x.name))
    assert final_results == results

    summary.delete_files()
    assert not [x for x in os.listdir(results_dir) if x.endswith(".csv")]
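The append helper passed to executor.map above is not part of this snippet. A minimal sketch consistent with the test's assertions, which expect even-numbered names in batch 1 and odd-numbered names in batch 2; the parity rule is inferred from expected1/expected2, not taken from the source:

# Hypothetical module-level helper for the test above.
def append(result):
    if int(result.name) % 2 == 0:
        pytest.aggregator1.append_result(result)
    else:
        pytest.aggregator2.append_result(result)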
Code example #11
File: async_cli_command.py Project: NREL/jade
    def _complete(self):
        self._return_code = self._pipe.returncode
        exec_time_s = time.time() - self._start_time

        if not self._is_manager_node:
            # This will happen on a multi-node job. Don't complete it multiple times.
            logger.info(
                "Job %s completed on non-manager node return_code=%s exec_time_s=%s",
                self._job.name,
                self._return_code,
                exec_time_s,
            )
            return

        status = JobCompletionStatus.FINISHED
        output_dir = self._output / JOBS_OUTPUT_DIR / self._job.name
        bytes_consumed = get_directory_size_bytes(output_dir)
        event = StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)
        result = Result(
            self._job.name, self._return_code, status, exec_time_s, hpc_job_id=self._hpc_job_id
        )
        ResultsAggregator.append(self._output, result, batch_id=self._batch_id)

        logger.info(
            "Job %s completed return_code=%s exec_time_s=%s hpc_job_id=%s",
            self._job.name,
            self._return_code,
            exec_time_s,
            self._hpc_job_id,
        )
Code example #12
def test_results_aggregator(cleanup):
    """Test ResultsAggregator"""
    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)

    results = [create_result(i) for i in range(100)]
    os.makedirs(OUTPUT)
    pytest.aggregator = ResultsAggregator.create(OUTPUT)
    assert os.path.exists(pytest.aggregator._filename)

    for result in results:
        if int(result.name) % 2 == 0:
            pytest.aggregator.append_result(result)

    final_results = pytest.aggregator.get_results()
    final_results.sort(key=lambda x: int(x.name))
    expected = [x for x in results if int(x.name) % 2 == 0]
    assert final_results == expected
Code example #13
    def submit_jobs(self, cluster, force_local=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        Parameters
        ----------
        cluster : Cluster
        force_local : bool
            If on HPC, run jobs through subprocess as if local.

        Returns
        -------
        Status

        """
        if self._is_new:
            logger.info("Submit %s jobs for execution.",
                        self._config.get_num_jobs())
            logger.info("JADE version %s", jade.version.__version__)
            registry = Registry()
            loggers = registry.list_loggers()
            logger.info("Registered modules for logging: %s",
                        ", ".join(loggers))
            self._save_repository_info(registry)

            ResultsAggregator.create(self._output)

            # If an events summary file exists, it is invalid.
            events_file = os.path.join(self._output, EVENTS_FILENAME)
            if os.path.exists(events_file):
                os.remove(events_file)

            event = StructuredLogEvent(
                source="submitter",
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_SUBMIT_COMPLETED,
                message="job submission started",
                num_jobs=self.get_num_jobs(),
            )
            log_event(event)

            os.environ["JADE_RUNTIME_OUTPUT"] = self._output
            if self._config.setup_command is not None:
                # The environment variable is already exported above, so the bare
                # command is executed; the prefixed string is built only for logging.
                cmd = f"JADE_RUNTIME_OUTPUT={self._output} {self._config.setup_command}"
                logger.info("Running setup command: %s", cmd)
                check_run_command(self._config.setup_command)
        else:
            self._handle_submission_groups()

        result = Status.IN_PROGRESS
        group = self._config.get_default_submission_group()
        groups = make_submission_group_lookup(cluster.config.submission_groups)
        self._hpc = HpcManager(groups, self._output)

        if self._hpc.hpc_type == HpcType.LOCAL or force_local:
            runner = JobRunner(self._config_file, output=self._output)
            num_processes = group.submitter_params.num_processes
            verbose = group.submitter_params.verbose
            result = runner.run_jobs(verbose=verbose,
                                     num_processes=num_processes)
            agg = ResultsAggregator.load(self._output)
            agg.process_results()
            is_complete = True
        else:
            is_complete = self._submit_to_hpc(cluster)

        if is_complete:
            result = self._handle_completion(cluster)

        return result
Code example #14
File: resubmit_jobs.py Project: jgu2/jade
def _reset_results(output, jobs_to_resubmit):
    aggregator = ResultsAggregator.load(output)
    aggregator.clear_results_for_resubmission(jobs_to_resubmit)
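A hypothetical call site, consistent with the resubmission tests on this page, where the names of missing or failed jobs come from the results summary:

# Hypothetical invocation; "7" matches the job removed in example #15 below.
_reset_results(OUTPUT, ["7"])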
Code example #15
def test_resubmit_with_blocking_jobs(basic_setup):
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive:
    # job "4" is blocked by "5", "5" is blocked by "7", and "7" is blocked by "6",
    # so resubmitting "7" must also pull in "5" and "4" (the three jobs asserted
    # in the second batch at the end of the test).
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = set([5])
        elif i == 4:
            job_param.blocked_by = set([7])
        elif i == 6:
            job_param.blocked_by = set([6])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands

    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3