Example #1
0
    def _update_completed_jobs(self):
        """Detect jobs that finished since the last check and cancel any
        not-yet-submitted jobs whose blocking jobs failed.

        Returns
        -------
        tuple
            (set of names of newly completed jobs, list of canceled jobs)
        """
        completed_names = set()
        canceled = []
        aggregator = ResultsAggregator.load(self._output)
        # Canceling a job injects a synthetic failed result, which may in turn
        # require canceling jobs blocked on it — so loop until no new
        # cancellations occur.
        pending_results = []
        rerun = True
        while rerun:
            rerun = False
            failed = set()
            for result in itertools.chain(aggregator.process_results(),
                                          pending_results):
                completed_names.add(result.name)
                if result.return_code != 0:
                    failed.add(result.name)
            pending_results.clear()

            logger.debug("Detected completion of jobs: %s", completed_names)
            logger.debug("Detected failed jobs: %s", failed)
            for job in self._cluster.iter_jobs(state=JobState.NOT_SUBMITTED):
                if not job.blocked_by:
                    continue
                if job.cancel_on_blocking_job_failure and job.blocked_by.intersection(
                        failed):
                    # The cancellation result feeds back into the next pass.
                    pending_results.append(self._cancel_job(job, aggregator))
                    canceled.append(job)
                    rerun = True
                else:
                    job.blocked_by.difference_update(completed_names)

        return completed_names, canceled
Example #2
0
def test_resubmit_missing(cleanup):
    """Verify that a job missing from the results gets run on resubmit."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    # All jobs succeeded; drop the last raw result to simulate a missing job.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    assert all(r.return_code == 0 for r in results)
    results.pop()
    agg._write_results(results)

    # Patch the final results file so it reports one missing job.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    dropped = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [dropped["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert not summary.get_failed_results()
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    # Resubmitting should run only the missing job and restore a full set.
    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    assert len(ResultsSummary(OUTPUT).get_successful_results()) == NUM_COMMANDS
Example #3
0
def test_resubmit_failed(cleanup):
    """Verify that a failed job gets run again on resubmit."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    # All jobs succeeded; rewrite the first result with a non-zero return code.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    assert all(r.return_code == 0 for r in results)
    first = results[0]
    results[0] = Result(first.name, 1, first.status, first.exec_time_s,
                        first.completion_time)
    agg._write_results(results)

    # Patch the final results file to match the injected failure.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    assert ResultsSummary(OUTPUT).get_failed_results()[0].name == "1"

    # Resubmitting should rerun the failed job and make everything successful.
    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    assert len(ResultsSummary(OUTPUT).get_successful_results()) == NUM_COMMANDS
Example #4
0
    def submit_jobs(self, cluster, force_local=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        Parameters
        ----------
        cluster : Cluster
        force_local : bool
            If on HPC, run jobs through subprocess as if local.

        Returns
        -------
        Status

        """
        if self._is_new:
            logger.info("Submit %s jobs for execution.",
                        self._config.get_num_jobs())
            logger.info("JADE version %s", jade.version.__version__)
            registry = Registry()
            loggers = registry.list_loggers()
            logger.info("Registered modules for logging: %s",
                        ", ".join(loggers))
            self._save_repository_info(registry)

            ResultsAggregator.create(self._output)

            # If an events summary file exists, it is invalid.
            events_file = os.path.join(self._output, EVENTS_FILENAME)
            if os.path.exists(events_file):
                os.remove(events_file)

            # NOTE(review): the event name is SUBMIT_COMPLETED but the message
            # says "started" — confirm which is intended; left unchanged since
            # downstream consumers may key on the event name.
            event = StructuredLogEvent(
                source="submitter",
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_SUBMIT_COMPLETED,
                message="job submission started",
                num_jobs=self.get_num_jobs(),
            )
            log_event(event)

            # Exported so the setup command and the jobs can locate the output
            # directory through the environment.
            os.environ["JADE_RUNTIME_OUTPUT"] = self._output
            if self._config.setup_command is not None:
                # Bug fix: previously a command string with a
                # "JADE_RUNTIME_OUTPUT=..." prefix was logged while a different
                # command (without the prefix) was executed. Log exactly what
                # runs; the env var is already available via os.environ above.
                logger.info("Running setup command: %s",
                            self._config.setup_command)
                check_run_command(self._config.setup_command)
        else:
            self._handle_submission_groups()

        result = Status.IN_PROGRESS
        group = self._config.get_default_submission_group()
        groups = make_submission_group_lookup(cluster.config.submission_groups)
        self._hpc = HpcManager(groups, self._output)

        if self._hpc.hpc_type == HpcType.LOCAL or force_local:
            # Run all jobs on the local system instead of an HPC queue.
            runner = JobRunner(self._config_file, output=self._output)
            num_processes = group.submitter_params.num_processes
            verbose = group.submitter_params.verbose
            result = runner.run_jobs(verbose=verbose,
                                     num_processes=num_processes)
            agg = ResultsAggregator.load(self._output)
            agg.process_results()
            is_complete = True
        else:
            is_complete = self._submit_to_hpc(cluster)

        if is_complete:
            result = self._handle_completion(cluster)

        return result
Example #5
0
def _reset_results(output, jobs_to_resubmit):
    """Clear stored results for the jobs that are about to be resubmitted."""
    ResultsAggregator.load(output).clear_results_for_resubmission(jobs_to_resubmit)
Example #6
0
def test_resubmit_with_blocking_jobs(basic_setup):
    """Verify that resubmitting a missing job also reruns jobs blocked on it."""
    num_commands = 7
    with open(TEST_FILENAME, "w") as f_out:
        for _ in range(num_commands):
            f_out.write('echo "hello world"' + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    blocked_by_index = {3: {5}, 4: {7}, 6: {6}}
    for i, job_param in enumerate(inputs.iter_jobs()):
        if i in blocked_by_index:
            job_param.blocked_by = set(blocked_by_index[i])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    # Everything succeeded; remove job "7" from the raw results to fake a
    # missing job.
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    assert all(r.return_code == 0 for r in results)
    index = next((i for i, r in enumerate(results) if r.name == "7"), None)
    assert index is not None
    results.pop(index)
    agg._write_results(results)

    # Patch the final results file to report job "7" as missing.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, entry in enumerate(final_results["results"]):
        if entry["name"] == "7":
            missing = final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert not summary.get_failed_results()
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    assert len(ResultsSummary(OUTPUT).get_successful_results()) == num_commands

    # The second batch should contain job 7 plus the jobs blocked behind it.
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3