Example #1
    def _run_job(self, job: Job, params_it: Iterable[ParamSet]) -> Iterator[Tuple[ParamSet, Result]]:
        # TODO: introduce public API of Process to get working_directory
        if job.process._working_directory is not None:
            raise Exception('HTCondor does not support setting the working_directory on Process')

        with ExitStack() as stack:
            cluster_generator = _JobClusterGenerator(self, job, params_it)
            stack.enter_context(cluster_generator)

            submit = Submit()

            with self._schedd.transaction() as txn:
                submit_result = submit.queue_with_itemdata(txn, itemdata=iter(cluster_generator))

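            # Make sure the submitted cluster is removed from the queue when the
            # ExitStack unwinds, whether the block exits normally or with an error.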
            stack.callback(
                self._schedd.act, JobAction.Remove, f'ClusterId == {submit_result.cluster()}',
            )

            job_states: Dict[int, _JobState] = {}

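            # Poll the schedd until every job in the cluster reports as completed,
            # sleeping between queries according to _get_poll_sleep_times().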
            for sleep_time in _get_poll_sleep_times():
                sleep(sleep_time)

                query_result = self._schedd.xquery(
                    requirements=f'ClusterId == {submit_result.cluster()}',
                    projection=_JobState.projection(),
                )

                job_states.clear()
                for job_state_ad in query_result:
                    job_state = _JobState.from_class_ad(job_state_ad)
                    job_states[job_state.proc_id] = job_state

                counts = _StatusCounts()
                counts.add_jobs(job_states.values())

                print(counts)

                if counts.completed == counts.total:
                    break

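            # Collect a result for each queued process, looked up by its HTCondor ProcId.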
            results: List[Tuple[ParamSet, Result]] = []
            for proc_id, process in enumerate(cluster_generator.processes):
                job_state = job_states[proc_id]
                if job_state.exit_by_signal:
                    raise _ProcessFailedError(
                        f'Process exited due to receiving signal {job_state.exit_signal}',
                    )
                if job_state.exit_code is None:
                    raise Exception('Exit code received from HTCondor is None')

                result = process.result(job_state.exit_code)
                self._check_for_failure(job, result, process.params)
                results.append((process.params, result))

            self._cleanup_handlers += cluster_generator.cleanup_handlers

        return iter(results)
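
Example #1 paces its polling loop with _get_poll_sleep_times(), whose definition is not shown in the snippet. Any generator of sleep intervals fits the loop; a minimal sketch, assuming a short initial delay that backs off to a steady interval, could look like the following (illustrative only, not the original helper):

from itertools import chain, repeat
from typing import Iterator

def _get_poll_sleep_times() -> Iterator[float]:
    # Poll quickly at first, then settle into a fixed 30-second interval
    # for long-running clusters. Intervals are in seconds.
    yield from chain([1.0, 2.0, 5.0, 10.0], repeat(30.0))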
Example #2
def execute_submit(submit_object: htcondor.Submit, itemdata: List[Dict[str, str]]) -> int:
    """
    Execute a map via the scheduler defined by the settings.
    Return the HTCondor cluster ID of the map's jobs.
    """
    schedd = get_schedd()
    with schedd.transaction() as txn:
        submit_result = submit_object.queue_with_itemdata(
            txn,
            1,
            iter(itemdata),
        )

        return submit_result.cluster()
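
A minimal calling sketch for Example #2, assuming get_schedd() is already configured by the surrounding module; the submit description and itemdata values below are hypothetical:

import htcondor

# One job is queued per itemdata entry; $(word) is substituted from that entry.
submit_object = htcondor.Submit({
    'executable': '/bin/echo',
    'arguments': '$(word)',
    'output': 'echo-$(ProcId).out',
})

itemdata = [{'word': 'hello'}, {'word': 'world'}]

cluster_id = execute_submit(submit_object, itemdata)
print(f'Submitted cluster {cluster_id}')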