def _make_singularity_command(self):
    params = self._submission_group.submitter_params.singularity_params
    if not params.run_command:
        raise InvalidConfiguration("Singularity command cannot be empty.")
    container_path = Path(params.container)
    cmd = f"{params.run_command} {container_path} {self._run_script}"
    sing_script = self._run_script.parent / self._run_script.name.replace("run", "singularity", 1)
    text = "#!/bin/bash\n" + params.load_command + "\n" + cmd + "\n"
    create_script(str(sing_script), text)
    return sing_script
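
# Illustrative standalone sketch (not library code): shows the wrapper script text
# that _make_singularity_command assembles. The run_command, container, load_command,
# and run_script values below are made-up example values.
from pathlib import Path

run_command = "singularity run"                 # assumed example value
container_path = Path("/containers/jade.sif")   # assumed example value
load_command = "module load singularity"        # assumed example value
run_script = Path("output/run.sh")              # assumed example value

cmd = f"{run_command} {container_path} {run_script}"
text = "#!/bin/bash\n" + load_command + "\n" + cmd + "\n"
# text now contains:
#   #!/bin/bash
#   module load singularity
#   singularity run /containers/jade.sif output/run.sh
print(text)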
def __init__(self, output_dir):
    self._output_dir = output_dir
    self._results_file = os.path.join(output_dir, RESULTS_FILE)
    if not os.path.exists(self._results_file):
        raise InvalidConfiguration(f"There is no results file in {output_dir}")
    data = self._parse(self._results_file)
    data["results"] = deserialize_results(data["results"])
    self._results = data
    self._missing_jobs = data["missing_jobs"]
    self._base_directory = data["base_directory"]
def check_job_dependencies(self, submitter_params):
    """Check for impossible conditions with job dependencies.

    Parameters
    ----------
    submitter_params : SubmitterParams

    Raises
    ------
    InvalidConfiguration
        Raised if job dependencies have an impossible condition.

    """
    requires_estimated_time = submitter_params.per_node_batch_size == 0
    # This currently only checks that all jobs defined as blocking exist.
    # It does not look for deadlocks.
    job_names = set()
    blocking_jobs = set()
    missing_estimate = []
    for job in self.iter_jobs():
        job_names.add(job.name)
        blocking_jobs.update(job.get_blocking_jobs())
        if requires_estimated_time and job.estimated_run_minutes is None:
            missing_estimate.append(job.name)

    missing_jobs = blocking_jobs.difference(job_names)
    if missing_jobs:
        for job in missing_jobs:
            logger.error("%s is blocking a job but does not exist", job)
        raise InvalidConfiguration("job ordering definitions are invalid")

    if missing_estimate:
        for job in missing_estimate:
            logger.error("Job %s does not define estimated_run_minutes", job)
        raise InvalidConfiguration(
            "Submitting batches by time requires that each job define estimated_run_minutes"
        )
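
# Illustrative standalone sketch (stdlib only): the core validation in
# check_job_dependencies is a set difference between every job named as a blocker
# and the jobs that actually exist. The job names below are invented examples.
job_names = {"job1", "job2", "job3"}
blocking_jobs = {"job1", "job4"}  # "job4" is referenced as a blocker but never defined

missing_jobs = blocking_jobs.difference(job_names)
assert missing_jobs == {"job4"}  # a non-empty difference is what triggers InvalidConfiguration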
def get_config_exec_time(self):
    """Return the total number of seconds to run all jobs in the config.

    Returns
    -------
    int

    """
    events = self.list_events(EVENT_NAME_CONFIG_EXEC_SUMMARY)
    if not events:
        raise InvalidConfiguration("no batch summary events found")
    return events[0].data["config_execution_time"]
def add_job(self, job):
    # Overrides JobConfiguration.add_job so that it can add a unique
    # identifier to each job.
    # This will not be true at deserialization.
    if job.job_id is None:
        job.job_id = self._cur_job_id
        self._cur_job_id += 1
    if not job.command:
        raise InvalidConfiguration(f"command cannot be empty: job_id={job.job_id}")
    self._jobs.add_job(job)
@classmethod
def _deserialize(cls, path, try_promote_to_submitter=False, deserialize_jobs=False):
    config_file = cls.get_config_file(path)
    if not os.path.isfile(config_file):
        raise InvalidConfiguration(f"{config_file} does not exist")
    config = ClusterConfig(**load_data(config_file))
    cluster = cls(config)
    promoted = False
    if try_promote_to_submitter:
        promoted = cluster._promote_to_submitter()
    if deserialize_jobs:
        cluster._deserialize_jobs()
    return cluster, promoted
def check_job_runtimes(self):
    """Check for any job with a longer estimated runtime than the walltime.

    Raises
    ------
    InvalidConfiguration
        Raised if any job is too long.

    """
    wall_times = {x.name: x.submitter_params.get_wall_time() for x in self.submission_groups}
    for job in self.iter_jobs():
        wall_time = wall_times[job.submission_group]
        if job.estimated_run_minutes is not None:
            estimate = timedelta(minutes=job.estimated_run_minutes)
            if estimate > wall_time:
                raise InvalidConfiguration(
                    f"job {job.name} has estimated_run_minutes={estimate} longer than "
                    f"wall_time={wall_time}"
                )
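
# Illustrative standalone sketch (stdlib only): check_job_runtimes compares each job's
# estimated_run_minutes, converted to a timedelta, against its group's wall time.
# The wall time and estimate below are assumed example values.
from datetime import timedelta

wall_time = timedelta(hours=4)        # e.g. a group wall time of 04:00:00
estimate = timedelta(minutes=5 * 60)  # a job estimated at 300 minutes
if estimate > wall_time:
    # the real method raises InvalidConfiguration here
    print("job estimate exceeds the wall time")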
def check_job_dependencies(self):
    """Check for impossible conditions with job dependencies.

    Raises
    ------
    InvalidConfiguration
        Raised if job dependencies have an impossible condition.

    """
    # This currently only checks that all jobs defined as blocking exist.
    # It does not look for deadlocks.
    job_names = set()
    blocking_jobs = set()
    for job in self.iter_jobs():
        job_names.add(job.name)
        blocking_jobs.update(job.get_blocking_jobs())

    missing_jobs = blocking_jobs.difference(job_names)
    if missing_jobs:
        for job in missing_jobs:
            logger.error("%s is blocking a job but does not exist", job)
        raise InvalidConfiguration("job ordering definitions are invalid")
def check_submission_groups(self, submitter_params):
    """Check for invalid job submission group assignments. Make a default group
    if none are defined and assign it to each job.

    Parameters
    ----------
    submitter_params : SubmitterParams

    Raises
    ------
    InvalidConfiguration
        Raised if submission group assignments are invalid.

    """
    groups = self.submission_groups
    if not groups:
        self._assign_default_submission_group(submitter_params)
        return

    first_group = next(iter(groups))
    group_params = (
        "try_add_blocked_jobs",
        "time_based_batching",
        "num_processes",
        "hpc_config",
        "per_node_batch_size",
        "singularity_params",
        "distributed_submitter",
    )
    user_overrides = (
        "distributed_submitter",
        "generate_reports",
        "resource_monitor_interval",
        "resource_monitor_type",
        "dry_run",
        "verbose",
    )
    user_override_if_not_set = ("node_setup_script", "node_shutdown_script")
    must_be_same = ("max_nodes", "poll_interval")
    all_params = (must_be_same, group_params, user_overrides, user_override_if_not_set)
    fields = {item for params in all_params for item in params}
    assert sorted(fields) == sorted(SubmitterParams.__fields__), sorted(fields)

    hpc_type = first_group.submitter_params.hpc_config.hpc_type
    group_names = set()
    for group in groups:
        if group.name in group_names:
            raise InvalidConfiguration(f"submission group {group.name} is listed twice")
        group_names.add(group.name)
        if group.submitter_params.hpc_config.hpc_type != hpc_type:
            raise InvalidConfiguration("hpc_type values must be the same in all groups")
        for param in must_be_same:
            first_val = getattr(first_group.submitter_params, param)
            this_val = getattr(group.submitter_params, param)
            if this_val != first_val:
                raise InvalidConfiguration(f"{param} must be the same in all groups")
        for param in user_overrides:
            user_val = getattr(submitter_params, param)
            setattr(group.submitter_params, param, user_val)
        for param in user_override_if_not_set:
            user_val = getattr(submitter_params, param)
            group_val = getattr(group.submitter_params, param)
            if group_val is None:
                setattr(group.submitter_params, param, user_val)

    jobs_by_group = defaultdict(list)
    for job in self.iter_jobs():
        if job.submission_group is None:
            raise InvalidConfiguration(
                f"Job {job.name} does not have a submission group assigned"
            )
        if job.submission_group not in group_names:
            raise InvalidConfiguration(
                f"Job {job.name} has an invalid submission group: {job.submission_group}"
            )
        jobs_by_group[job.submission_group].append(job.name)

    group_counts = {}
    for name, jobs in jobs_by_group.items():
        if not jobs:
            logger.warning("Submission group %s does not have any jobs defined", name)
        group_counts[name] = len(jobs)

    for name, count in sorted(group_counts.items()):
        logger.info("Submission group %s has %s jobs", name, count)
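
# Illustrative standalone sketch (not library code): the assert in check_submission_groups
# requires that the four parameter categories, taken together, account for every
# SubmitterParams field, so a newly added field cannot be silently left uncategorized.
# The field names below are a reduced, made-up stand-in for SubmitterParams.__fields__.
must_be_same = ("max_nodes", "poll_interval")
group_params = ("num_processes", "hpc_config")
user_overrides = ("dry_run", "verbose")
user_override_if_not_set = ("node_setup_script",)

all_params = (must_be_same, group_params, user_overrides, user_override_if_not_set)
fields = {item for params in all_params for item in params}

known_fields = ["max_nodes", "poll_interval", "num_processes", "hpc_config",
                "dry_run", "verbose", "node_setup_script"]  # stand-in field list
assert sorted(fields) == sorted(known_fields)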
def add_job(self, job):
    if job.name in self._jobs:
        raise InvalidConfiguration(f"job name {job.name} is already stored")
    self._jobs[job.name] = job
    logger.debug("Added job %s", job.name)