def _update_job_settings(
    self,
    category: str,
    checkpoint_path: Path,
    ckpt_name: Locator,
    depends_on,
    job: Job,
    job_name: Locator,
    job_profiles: Iterable[PegasusProfile],
    resource_request: ResourceRequest,
    times_to_retry_job: int,
) -> DependencyNode:
    """
    Apply a variety of shared settings to a job.

    Centralized logic for multiple job types to use.
    """
    self._job_graph.add_jobs(job)

    # Translate the SLURM resource request into settings on the job.
    resource_request.apply_to_job(job, job_name=self._job_name_for(job_name))

    # The DAGMan category can be used to cap how many of these jobs run at once;
    # the retry count is attached via the same profile.
    job.add_dagman_profile(category=category, retry=str(times_to_retry_job))

    # Apply any additional user-defined Pegasus profiles.
    for profile in job_profiles:
        job.add_profiles(profile.namespace, key=profile.key, value=profile.value)

    # Wire up the dependency graph: every parent in `depends_on` becomes a
    # DAG edge (when it has a job) and its output files become inputs here.
    for ancestor in depends_on:
        if ancestor.job:
            self._job_graph.add_dependency(job, parents=[ancestor.job])
        for produced_file in ancestor.output_files:
            job.add_inputs(produced_file)

    # Output handling is currently limited to the checkpoint file.
    # See: https://github.com/isi-vista/vista-pegasus-wrapper/issues/25
    # An already-existing checkpoint is registered in the replica catalog so
    # the corresponding job is not re-run.
    checkpoint_pegasus_file = self.create_file(
        f"{ckpt_name}", checkpoint_path, add_to_catalog=checkpoint_path.exists()
    )
    job.add_outputs(checkpoint_pegasus_file, stage_out=False)

    return DependencyNode.from_job(job, output_files=[checkpoint_pegasus_file])
def apply_to_job(self, job: Job, *, job_name: str) -> None:
    """
    Configure *job* according to this resource request.

    Raises ``RuntimeError`` when no partition was specified, and
    ``ValueError`` when the request cannot be satisfied by the chosen
    partition or its options contradict each other.
    """
    if not self.partition:
        raise RuntimeError("A partition to run on must be specified.")

    # Reject jobs that could never finish within the partition's time limit.
    if self.partition.max_walltime < self.job_time_in_minutes:
        raise ValueError(
            f"Partition '{self.partition.name}' has a max walltime of {self.partition.max_walltime} mins, which is less than the time given ({self.job_time_in_minutes} mins) for job: {job_name}."
        )

    # Base sbatch-style argument string built from the request.
    qsub_args = SLURM_RESOURCE_STRING.format(
        num_cpus=self.num_cpus or 1,
        num_gpus=self.num_gpus if self.num_gpus is not None else 0,
        job_name=job_name,
        mem_str=to_slurm_memory_string(self.memory or _SLURM_DEFAULT_MEMORY),
    )

    # Pinning to a node that is also excluded makes no sense.
    if (
        self.exclude_list
        and self.run_on_single_node
        and self.run_on_single_node in self.exclude_list
    ):
        raise ValueError(
            "the 'exclude_list' and 'run_on_single_node' options are not consistent."
        )

    if self.exclude_list:
        qsub_args += f" --exclude={self.exclude_list}"
    if self.run_on_single_node:
        qsub_args += f" --nodelist={self.run_on_single_node}"
    # Borrowed-time partitions require an explicit QOS flag.
    if self.partition.name in (SCAVENGE, EPHEMERAL):
        qsub_args += f" --qos={self.partition.name}"

    job.add_pegasus_profile(
        runtime=str(self.job_time_in_minutes * 60),
        queue=str(self.partition.name),
        project=_BORROWED_KEY
        if self.partition.name in (EPHEMERAL, SCAVENGE)
        else self.partition.name,
        glite_arguments=qsub_args,
    )

    # Only assign a DAGMan category when the job does not already carry one.
    # NOTE(review): this stringifies the partition object itself rather than
    # `self.partition.name` — presumably its __str__ yields the name; confirm.
    if "dagman" not in job.profiles or "CATEGORY" not in job.profiles["dagman"]:
        job.add_dagman_profile(category=str(self.partition))