Example #1
    def _make_singularity_command(self):
        params = self._submission_group.submitter_params.singularity_params
        if not params.run_command:
            raise InvalidConfiguration("Singularity command cannot be empty.")
        container_path = Path(params.container)
        cmd = f"{params.run_command} {container_path} {self._run_script}"
        sing_script = self._run_script.parent / self._run_script.name.replace(
            "run", "singularity", 1)
        text = "#!/bin/bash\n" + params.load_command + "\n" + cmd + "\n"
        create_script(str(sing_script), text)
        return sing_script
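
A minimal standalone sketch of the same wrapper-script pattern using only the standard library; the "singularity run" command and the default load_command are assumptions, and create_script from the original is replaced with stdlib calls:

import stat
from pathlib import Path

def make_singularity_wrapper(run_script: Path, container: Path,
                             load_command: str = "module load singularity") -> Path:
    # Command that runs the original script inside the container;
    # the original reads this command from submitter params.
    cmd = f"singularity run {container} {run_script}"
    # Derive the wrapper's name from the run script, as the original does.
    wrapper = run_script.parent / run_script.name.replace("run", "singularity", 1)
    wrapper.write_text(f"#!/bin/bash\n{load_command}\n{cmd}\n")
    # Make the wrapper executable; create_script presumably does the same.
    wrapper.chmod(wrapper.stat().st_mode | stat.S_IEXEC)
    return wrapper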
Example #2
    def __init__(self, output_dir):
        self._output_dir = output_dir
        self._results_file = os.path.join(output_dir, RESULTS_FILE)
        if not os.path.exists(self._results_file):
            raise InvalidConfiguration(
                f"There is no results file in {output_dir}")

        data = self._parse(self._results_file)
        data["results"] = deserialize_results(data["results"])
        self._results = data
        self._missing_jobs = data["missing_jobs"]
        self._base_directory = data["base_directory"]
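
Hypothetical usage of this constructor; ResultsSummary is an assumed name for the surrounding class, and the path is illustrative:

try:
    summary = ResultsSummary("/path/to/jade-output")
except InvalidConfiguration as exc:
    # Raised above when the directory has no results file.
    print(f"cannot load results: {exc}")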
Example #3
    def check_job_dependencies(self, submitter_params):
        """Check for impossible conditions with job dependencies.

        Parameters
        ----------
        submitter_params : SubmitterParams

        Raises
        ------
        InvalidConfiguration
            Raised if job dependencies have an impossible condition.

        """
        requires_estimated_time = submitter_params.per_node_batch_size == 0

        # This currently only checks that all jobs defined as blocking exist.
        # It does not look for deadlocks.

        job_names = set()
        blocking_jobs = set()
        missing_estimate = []
        for job in self.iter_jobs():
            job_names.add(job.name)
            blocking_jobs.update(job.get_blocking_jobs())
            if requires_estimated_time and job.estimated_run_minutes is None:
                missing_estimate.append(job.name)

        missing_jobs = blocking_jobs.difference(job_names)
        if missing_jobs:
            for job in missing_jobs:
                logger.error("%s is blocking a job but does not exist", job)
            raise InvalidConfiguration("job ordering definitions are invalid")

        if missing_estimate:
            for job in missing_estimate:
                logger.error("Job %s does not define estimated_run_minutes",
                             job)
            raise InvalidConfiguration(
                "Submitting batches by time requires that each job define estimated_run_minutes"
            )
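
Both validations can be sketched standalone with plain dicts standing in for the job objects; all names and values below are illustrative:

jobs = [
    {"name": "preprocess", "blocked_by": [], "estimated_run_minutes": 10},
    {"name": "train", "blocked_by": ["preprocess"], "estimated_run_minutes": None},
]
job_names = {job["name"] for job in jobs}
blocking_jobs = {name for job in jobs for name in job["blocked_by"]}
# Every job named as blocking must itself exist.
missing = blocking_jobs - job_names
# Time-based batching additionally requires a runtime estimate on every job.
missing_estimate = [job["name"] for job in jobs
                    if job["estimated_run_minutes"] is None]
print(missing)           # set()
print(missing_estimate)  # ['train']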
Example #4
    def get_config_exec_time(self):
        """Return the total number of seconds to run all jobs in the config.

        Returns
        -------
        int

        """
        events = self.list_events(EVENT_NAME_CONFIG_EXEC_SUMMARY)
        if not events:
            raise InvalidConfiguration("no batch summary events found")

        return events[0].data["config_execution_time"]
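
The only structure this method assumes is an event object carrying a data dict; a toy stand-in makes that concrete (SimpleNamespace replaces the real event class, and the value is illustrative):

from types import SimpleNamespace

events = [SimpleNamespace(data={"config_execution_time": 3600})]
if not events:
    raise RuntimeError("no batch summary events found")
print(events[0].data["config_execution_time"])  # 3600 seconds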
Example #5
    def add_job(self, job):
        # Overrides JobConfiguration.add_job so that it can add a unique
        # identifier to each job.

        # This will not be true at deserialization.
        if job.job_id is None:
            job.job_id = self._cur_job_id
            self._cur_job_id += 1

        if not job.command:
            raise InvalidConfiguration(
                f"command cannot be emtpy: job_id={job.job_id}")
        self._jobs.add_job(job)
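
A standalone sketch of the auto-increment pattern, assuming new jobs arrive without IDs while deserialized jobs already have them; dicts stand in for job objects:

class JobRegistry:
    """Illustrative stand-in for the configuration class above."""

    def __init__(self):
        self._cur_job_id = 1
        self._jobs = {}

    def add_job(self, job):
        # Assign the next ID only if the job doesn't already have one,
        # so deserialized jobs keep their original IDs.
        if job.get("job_id") is None:
            job["job_id"] = self._cur_job_id
            self._cur_job_id += 1
        if not job.get("command"):
            raise ValueError(f"command cannot be empty: job_id={job['job_id']}")
        self._jobs[job["job_id"]] = job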
Example #6
    @classmethod
    def _deserialize(cls,
                     path,
                     try_promote_to_submitter=False,
                     deserialize_jobs=False):
        config_file = cls.get_config_file(path)
        if not os.path.isfile(config_file):
            raise InvalidConfiguration(f"{config_file} does not exist")

        config = ClusterConfig(**load_data(config_file))
        cluster = cls(config)
        promoted = False
        if try_promote_to_submitter:
            promoted = cluster._promote_to_submitter()
        if deserialize_jobs:
            cluster._deserialize_jobs()

        return cluster, promoted
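
A hypothetical call site, assuming the enclosing class is named Cluster and exposes a public deserialize wrapper around this private method (a sketch, not the library's confirmed API):

cluster, promoted = Cluster.deserialize(
    "/path/to/output-dir", try_promote_to_submitter=True, deserialize_jobs=True)
if promoted:
    print("this process now owns the submitter role")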
Example #7
    def check_job_runtimes(self):
        """Check for any job with a longer estimated runtime than the walltime.

        Raises
        ------
        InvalidConfiguration
            Raised if any job is too long.

        """
        wall_times = {
            x.name: x.submitter_params.get_wall_time()
            for x in self.submission_groups
        }
        for job in self.iter_jobs():
            wall_time = wall_times[job.submission_group]
            if job.estimated_run_minutes is not None:
                estimate = timedelta(minutes=job.estimated_run_minutes)
                if estimate > wall_time:
                    raise InvalidConfiguration(
                        f"job {job.name} has estimated_run_minutes={estimate} longer than wall_time={wall_time}"
                    )
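
The timedelta comparison at the core of this check, standalone; the values are illustrative:

from datetime import timedelta

wall_time = timedelta(hours=4)
estimate = timedelta(minutes=300)  # 5 hours
if estimate > wall_time:
    print(f"estimated run time {estimate} exceeds wall_time {wall_time}")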
Example #8
    def check_job_dependencies(self):
        """Check for impossible conditions with job dependencies.

        Raises
        ------
        InvalidConfiguration
            Raised if job dependencies have an impossible condition.

        """
        # This currently only checks that all jobs defined as blocking exist.
        # It does not look for deadlocks.

        job_names = set()
        blocking_jobs = set()
        for job in self.iter_jobs():
            job_names.add(job.name)
            blocking_jobs.update(job.get_blocking_jobs())

        missing_jobs = blocking_jobs.difference(job_names)
        if missing_jobs:
            for job in missing_jobs:
                logger.error("%s is blocking a job but does not exist", job)
            raise InvalidConfiguration("job ordering definitions are invalid")
Example #9
    def check_submission_groups(self, submitter_params):
        """Check for invalid job submission group assignments.
        Make a default group if none are defined and assign it to each job.

        Parameters
        ----------
        submitter_params : SubmitterParams

        Raises
        ------
        InvalidConfiguration
            Raised if submission group assignments are invalid.

        """
        groups = self.submission_groups
        if not groups:
            self._assign_default_submission_group(submitter_params)
            return

        first_group = next(iter(groups))
        group_params = (
            "try_add_blocked_jobs",
            "time_based_batching",
            "num_processes",
            "hpc_config",
            "per_node_batch_size",
            "singularity_params",
            "distributed_submitter",
        )
        user_overrides = (
            "distributed_submitter",
            "generate_reports",
            "resource_monitor_interval",
            "resource_monitor_type",
            "dry_run",
            "verbose",
        )
        user_override_if_not_set = ("node_setup_script",
                                    "node_shutdown_script")
        must_be_same = ("max_nodes", "poll_interval")
        all_params = (must_be_same, group_params, user_overrides,
                      user_override_if_not_set)
        fields = {item for params in all_params for item in params}
        assert sorted(fields) == sorted(
            SubmitterParams.__fields__), sorted(fields)
        hpc_type = first_group.submitter_params.hpc_config.hpc_type
        group_names = set()
        for group in groups:
            if group.name in group_names:
                raise InvalidConfiguration(
                    f"submission group {group.name} is listed twice")
            group_names.add(group.name)
            if group.submitter_params.hpc_config.hpc_type != hpc_type:
                raise InvalidConfiguration(
                    "hpc_type values must be the same in all groups")
            for param in must_be_same:
                first_val = getattr(first_group.submitter_params, param)
                this_val = getattr(group.submitter_params, param)
                if this_val != first_val:
                    raise InvalidConfiguration(
                        f"{param} must be the same in all groups")
            for param in user_overrides:
                user_val = getattr(submitter_params, param)
                setattr(group.submitter_params, param, user_val)
            for param in user_override_if_not_set:
                user_val = getattr(submitter_params, param)
                group_val = getattr(group.submitter_params, param)
                if group_val is None:
                    setattr(group.submitter_params, param, user_val)

        jobs_by_group = defaultdict(list)
        for job in self.iter_jobs():
            if job.submission_group is None:
                raise InvalidConfiguration(
                    f"Job {job.name} does not have a submission group assigned"
                )
            if job.submission_group not in group_names:
                raise InvalidConfiguration(
                    f"Job {job.name} has an invalid submission group: {job.submission_group}"
                )
            jobs_by_group[job.submission_group].append(job.name)

        group_counts = {}
        for name, jobs in jobs_by_group.items():
            if not jobs:
                logger.warning(
                    "Submission group %s does not have any jobs defined", name)
            group_counts[name] = len(jobs)

        for name, count in sorted(group_counts.items()):
            logger.info("Submission group %s has %s jobs", name, count)
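
How the two override tiers behave can be shown with a toy params object; the field names come from the tuples above, everything else is illustrative:

from types import SimpleNamespace

user = SimpleNamespace(verbose=True, node_setup_script="setup.sh")
group = SimpleNamespace(verbose=False, node_setup_script=None)

# "user_overrides" fields are always taken from the user's params.
group.verbose = user.verbose
# "user_override_if_not_set" fields keep an existing group value.
if group.node_setup_script is None:
    group.node_setup_script = user.node_setup_script
print(group.verbose, group.node_setup_script)  # True setup.sh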
Example #10
    def add_job(self, job):
        if job.name in self._jobs:
            raise InvalidConfiguration(
                f"job name {job.name} is already stored")
        self._jobs[job.name] = job
        logger.debug("Added job %s", job.name)
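
A quick standalone rendering of the duplicate-name guard; dicts stand in for job objects and ValueError for InvalidConfiguration:

class NamedJobStore:
    def __init__(self):
        self._jobs = {}

    def add_job(self, job):
        # Reject a second job with the same name instead of silently replacing it.
        if job["name"] in self._jobs:
            raise ValueError(f"job name {job['name']} is already stored")
        self._jobs[job["name"]] = job

store = NamedJobStore()
store.add_job({"name": "train"})
store.add_job({"name": "train"})  # raises ValueError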