def _update_job_settings(
    self,
    category: str,
    checkpoint_path: Path,
    ckpt_name: Locator,
    depends_on,
    job: Job,
    job_name: Locator,
    job_profiles: Iterable[PegasusProfile],
    resource_request: ResourceRequest,
    times_to_retry_job: int,
) -> DependencyNode:
    """
    Apply a variety of shared settings to a job.

    Centralized logic for multiple job types to use.
    """
    self._job_graph.add_jobs(job)

    # Configure the SLURM resource request
    resource_request.apply_to_job(job, job_name=self._job_name_for(job_name))

    # Set the DAGMan category to potentially limit the number of active jobs
    job.add_dagman_profile(category=category, retry=str(times_to_retry_job))

    # Apply any other user-defined Pegasus profiles
    for profile in job_profiles:
        job.add_profiles(profile.namespace, key=profile.key, value=profile.value)

    # Add dependent jobs from the `depends_on` argument
    for parent_dependency in depends_on:
        if parent_dependency.job:
            self._job_graph.add_dependency(job, parents=[parent_dependency.job])
        for out_file in parent_dependency.output_files:
            job.add_inputs(out_file)

    # Handle output files.
    # Currently only the checkpoint file is handled as an output.
    # See: https://github.com/isi-vista/vista-pegasus-wrapper/issues/25
    # If the checkpoint file already exists, add it to the replica catalog
    # so that the job corresponding to the checkpoint file is not run again.
    checkpoint_pegasus_file = self.create_file(
        f"{ckpt_name}", checkpoint_path, add_to_catalog=checkpoint_path.exists()
    )
    job.add_outputs(checkpoint_pegasus_file, stage_out=False)

    return DependencyNode.from_job(job, output_files=[checkpoint_pegasus_file])
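# Illustrative sketch (not part of the source): how a job-scheduling method on this builder
# is expected to delegate to _update_job_settings. The method name and the pre-registered
# transformation are hypothetical; the real call sites appear in run_bash, run_container,
# and _run_python_job below.
def _run_my_tool_sketch(self, job_name, depends_on, resource_request, my_tool_transformation):
    job_dir = self.directory_for(job_name)
    checkpoint_path = job_dir / "___ckpt"  # touched by the generated script on success
    job = Job(my_tool_transformation)      # hypothetical, already-registered Transformation
    return self._update_job_settings(
        category=None,                     # no DAGMan category limit for this sketch
        checkpoint_path=checkpoint_path,
        ckpt_name=job_name / "___ckpt",
        depends_on=_canonicalize_depends_on(depends_on),
        job=job,
        job_name=job_name,
        job_profiles=immutableset(),
        resource_request=self.set_resource_request(resource_request),
        times_to_retry_job=0,
    )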
def apply_to_job(self, job: Job, *, job_name: str) -> None:
    if not self.partition:
        raise RuntimeError("A partition to run on must be specified.")
    if self.partition.max_walltime < self.job_time_in_minutes:
        raise ValueError(
            f"Partition '{self.partition.name}' has a max walltime of "
            f"{self.partition.max_walltime} mins, which is less than the time given "
            f"({self.job_time_in_minutes} mins) for job: {job_name}."
        )

    slurm_resource_content = SLURM_RESOURCE_STRING.format(
        num_cpus=self.num_cpus or 1,
        num_gpus=self.num_gpus if self.num_gpus is not None else 0,
        job_name=job_name,
        mem_str=to_slurm_memory_string(self.memory or _SLURM_DEFAULT_MEMORY),
    )

    if (
        self.exclude_list
        and self.run_on_single_node
        and self.run_on_single_node in self.exclude_list
    ):
        raise ValueError(
            "the 'exclude_list' and 'run_on_single_node' options are not consistent."
        )

    if self.exclude_list:
        slurm_resource_content += f" --exclude={self.exclude_list}"

    if self.run_on_single_node:
        slurm_resource_content += f" --nodelist={self.run_on_single_node}"

    if self.partition.name in (SCAVENGE, EPHEMERAL):
        slurm_resource_content += f" --qos={self.partition.name}"

    job.add_pegasus_profile(
        runtime=str(self.job_time_in_minutes * 60),
        queue=str(self.partition.name),
        project=_BORROWED_KEY
        if self.partition.name in (EPHEMERAL, SCAVENGE)
        else self.partition.name,
        glite_arguments=slurm_resource_content,
    )

    if (
        "dagman" not in job.profiles.keys()
        or "CATEGORY" not in job.profiles["dagman"].keys()
    ):
        job.add_dagman_profile(category=str(self.partition))
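# Usage sketch (illustrative, not from the source). Assumes SlurmResourceRequest exposes the
# attributes read above (partition, num_cpus, num_gpus, memory, job_time_in_minutes) as
# constructor keywords, and that `ephemeral_partition` is a Partition-like object with .name
# and .max_walltime in minutes; both are assumptions to check against the real package.
request = SlurmResourceRequest(
    partition=ephemeral_partition,
    num_cpus=8,
    num_gpus=1,
    memory=four_gb_memory_amount,  # placeholder for whatever type to_slurm_memory_string accepts
    job_time_in_minutes=90,
)
# apply_to_job raises ValueError if 90 minutes exceeds ephemeral_partition.max_walltime;
# otherwise it attaches a Pegasus profile whose glite_arguments carry the assembled SLURM
# flags, plus --qos and the borrowed project key because the partition is ephemeral.
request.apply_to_job(job, job_name="my_namespace_my_job")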
def run_bash(
    self,
    job_name: Locator,
    command: Union[Iterable[str], str],
    *,
    depends_on,
    resource_request: Optional[ResourceRequest] = None,
    category: Optional[str] = None,
    job_is_stageable: bool = False,
    job_bypass_staging: bool = False,
    times_to_retry_job: int = 0,
    job_profiles: Iterable[PegasusProfile] = immutableset(),
    container: Optional[Container] = None,
    path_to_bash: Path = BASH_EXECUTABLE_PATH,
) -> DependencyNode:
    """
    Schedule a job to run the given *command* with the given *resource_request*.

    If this job requires other jobs to be executed first, include them in *depends_on*.

    This method returns a `DependencyNode` which can be used in *depends_on* for future jobs.
    """
    if isinstance(command, str):
        command = [command]

    commands_hashable = immutableset(command)

    signature = (job_name, commands_hashable)
    if signature in self._signature_to_job:
        logging.info("Job %s recognized as a duplicate", job_name)
        return self._signature_to_job[signature]

    depends_on = _canonicalize_depends_on(depends_on)

    bash_transform = self._define_transformation(
        "bash",
        str(path_to_bash.absolute()),
        site=self._default_site,
        container=container,
        is_stageable=job_is_stageable,
        bypass_staging=job_bypass_staging,
    ).transformation

    job_dir = self.directory_for(job_name)
    ckpt_name = job_name / "___ckpt"
    ckpt_path = job_dir / "___ckpt"
    job_script = job_dir / "script.sh"

    commands_with_ckpt = list(command)
    commands_with_ckpt.append(f"touch {ckpt_path.absolute()}")
    commands_with_ckpt.insert(0, f"cd {job_dir}")

    job_script.write_text("\n".join(commands_with_ckpt))
    resource_request = self.set_resource_request(resource_request)

    bash_job = Job(bash_transform)
    bash_job.add_args(str(job_script.absolute()))
    dependency_node = self._update_job_settings(
        category,
        ckpt_path,
        ckpt_name,
        depends_on,
        bash_job,
        job_name,
        job_profiles,
        resource_request,
        times_to_retry_job,
    )
    self._signature_to_job[signature] = dependency_node

    logging.info("Scheduled bash job %s", job_name)
    return dependency_node
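# Usage sketch (illustrative, not from the source). Assumes `builder` is an instance of the
# surrounding workflow-builder class and that Locator can be built from an iterable of name
# parts; both are assumptions to check against the real package.
preprocess_job = builder.run_bash(
    Locator(("preprocess",)),
    "python preprocess.py --input data/raw --output data/clean",
    depends_on=[],
)
# The returned DependencyNode can seed `depends_on` for downstream jobs, and the checkpoint
# file touched by the generated script lets the wrapper skip the job on a rerun.
train_job = builder.run_bash(
    Locator(("train",)),
    ["source env.sh", "python train.py --data data/clean"],
    depends_on=[preprocess_job],
    times_to_retry_job=1,
)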
def run_container(
    self,
    job_name: Locator,
    docker_image_name: str,
    docker_args: str,
    docker_run_comand: str,
    docker_tar_path: str,
    *,
    depends_on,
    resource_request: Optional[ResourceRequest] = None,
    category: Optional[str] = None,
    pre_job_bash: str = "",
    post_job_bash: str = "",
    job_is_stageable: bool = False,
    job_bypass_staging: bool = True,
    times_to_retry_job: int = 0,
    job_profiles: Iterable[PegasusProfile] = immutableset(),
) -> DependencyNode:
    """
    Schedule a job to run the given Docker container via a generated shell script.

    Returns a `DependencyNode` which can be used in *depends_on* for future jobs.
    """
    job_dir = self.directory_for(job_name)
    ckpt_name = job_name / "___ckpt"
    checkpoint_path = job_dir / "___ckpt"
    depends_on = _canonicalize_depends_on(depends_on)

    signature = (docker_image_name, docker_args)
    if signature in self._signature_to_job:
        logging.info("Job %s recognized as a duplicate", job_name)
        return self._signature_to_job[signature]

    script_path = job_dir / "___run.sh"

    # Part of one strategy to run a container through a bash script
    self._docker_script_generator.write_shell_script_to(
        docker_image_name=docker_image_name,
        docker_command=docker_run_comand,
        docker_tar_path=docker_tar_path,
        working_directory=job_dir,
        script_path=script_path,
        cmd_args=docker_args,
        ckpt_path=checkpoint_path,
        pre_job=pre_job_bash,
        post_job=post_job_bash,
    )

    # TODO: Refactor this so it uses the bash transformation to form a job,
    # with the script path as an argument.
    # https://github.com/isi-vista/vista-pegasus-wrapper/issues/103
    script_executable = Transformation(
        self._job_name_for(job_name),
        namespace=self._namespace,
        version="4.0",
        site=self._default_site,
        pfn=script_path,
        is_stageable=job_is_stageable,
        bypass_staging=job_bypass_staging,
        arch=Arch.X86_64,
        os_type=OS.LINUX,
    )
    self._transformation_catalog.add_transformations(script_executable)
    resource_request = self.set_resource_request(resource_request)

    job = Job(script_executable)
    dependency_node = self._update_job_settings(
        category,
        checkpoint_path,
        ckpt_name,
        depends_on,
        job,
        job_name,
        job_profiles,
        resource_request,
        times_to_retry_job,
    )
    self._signature_to_job[signature] = dependency_node

    logging.info("Scheduled Docker job %s", job_name)
    return dependency_node
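# Usage sketch (illustrative, not from the source). `builder`, the image name, and the paths
# are placeholders; the keyword names mirror run_container's signature above, and `train_job`
# is the DependencyNode from the earlier run_bash sketch.
docker_job = builder.run_container(
    Locator(("score",)),
    docker_image_name="my-scorer:latest",
    docker_args="--input /data/clean --output /data/scores",
    docker_run_comand="python /app/score.py",  # forwarded as docker_command to the script generator; exact semantics are an assumption
    docker_tar_path="/nas/images/my-scorer.tar",
    depends_on=[train_job],
    pre_job_bash="module load docker",          # hypothetical site-specific setup
)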
def _run_python_job(
    self,
    job_name: Locator,
    python_module_or_path: Any,
    args_or_params: Union[Parameters, Dict[str, Any], str],
    *,
    depends_on,
    resource_request: Optional[ResourceRequest] = None,
    override_conda_config: Optional[CondaConfiguration] = None,
    category: Optional[str] = None,
    use_pypy: bool = False,
    container: Optional[Container] = None,
    pre_job_bash: str = "",
    post_job_bash: str = "",
    job_is_stageable: bool = False,
    job_bypass_staging: bool = False,
    times_to_retry_job: int = 0,
    job_profiles: Iterable[PegasusProfile] = immutableset(),
    treat_params_as_cmd_args: bool = False,
    input_file_paths: Union[Iterable[Union[Path, str]], Path, str] = immutableset(),
    output_file_paths: Union[Iterable[Union[Path, str]], Path, str] = immutableset(),
) -> DependencyNode:
    """
    Internal helper that centralizes the logic for scheduling a Python job.
    """
    job_dir = self.directory_for(job_name)
    ckpt_name = job_name / "___ckpt"
    checkpoint_path = job_dir / "___ckpt"
    signature_args = None
    depends_on = _canonicalize_depends_on(depends_on)

    if isinstance(python_module_or_path, (str, Path)):
        computed_module_or_path = python_module_or_path
    else:
        computed_module_or_path = fully_qualified_name(python_module_or_path)

    if not isinstance(args_or_params, str):
        # Allow users to specify the parameters as a dict for convenience
        if not isinstance(args_or_params, Parameters):
            args_or_params = Parameters.from_mapping(args_or_params)

        params_sink = CharSink.to_string()
        YAMLParametersWriter().write(args_or_params, params_sink)
        signature_args = params_sink.last_string_written

    signature = (
        computed_module_or_path,
        signature_args if signature_args else args_or_params,
    )
    if signature in self._signature_to_job:
        logging.info("Job %s recognized as a duplicate", job_name)
        return self._signature_to_job[signature]

    if container:
        return self._run_python_in_container(
            job_name,
            computed_module_or_path,
            args_or_params,
            container,
            depends_on=depends_on,
            input_files=input_file_paths,
            output_files=output_file_paths,
            resource_request=resource_request,
            category=category,
            pre_docker_bash=pre_job_bash,
            post_docker_bash=post_job_bash,
            job_is_stageable=job_is_stageable,
            job_bypass_staging=job_bypass_staging,
            times_to_retry_job=times_to_retry_job,
            job_profiles=job_profiles,
        )

    script_path = job_dir / "___run.sh"
    stdout_path = job_dir / "___stdout.log"

    self._conda_script_generator.write_shell_script_to(
        entry_point_name=computed_module_or_path,
        parameters=args_or_params,
        working_directory=job_dir,
        script_path=script_path,
        params_path=job_dir / "____params.params",
        stdout_file=stdout_path,
        ckpt_path=checkpoint_path,
        override_conda_config=override_conda_config,
        python="pypy3" if use_pypy else "python",
        pre_job=pre_job_bash,
        post_job=post_job_bash,
        treat_params_as_cmd_args=treat_params_as_cmd_args,
    )

    script_executable = Transformation(
        self._job_name_for(job_name),
        namespace=self._namespace,
        version="4.0",
        site=self._default_site,
        pfn=script_path,
        is_stageable=job_is_stageable,
        bypass_staging=job_bypass_staging,
        arch=Arch.X86_64,
        os_type=OS.LINUX,
        container=container,
    )
    self._transformation_catalog.add_transformations(script_executable)
    resource_request = self.set_resource_request(resource_request)

    job = Job(script_executable)
    dependency_node = self._update_job_settings(
        category,
        checkpoint_path,
        ckpt_name,
        depends_on,
        job,
        job_name,
        job_profiles,
        resource_request,
        times_to_retry_job,
    )
    self._signature_to_job[signature] = dependency_node

    logging.info("Scheduled Python job %s", job_name)
    return dependency_node
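# Usage sketch (illustrative, not from the source). Shows the dict parameter form and the
# signature-based deduplication. `builder`, the module path, and `preprocess_job` are
# placeholders; real callers would normally go through this helper's public wrappers rather
# than the internal method.
first = builder._run_python_job(
    Locator(("featurize",)),
    "my_project.featurize",  # module given as a dotted-path string
    {"input_dir": "data/clean", "output_dir": "data/features"},
    depends_on=[preprocess_job],
)
# A second request with the same module and parameters serializes to the same YAML string,
# so the same signature is found and the existing DependencyNode is returned without adding
# a duplicate job to the graph.
second = builder._run_python_job(
    Locator(("featurize",)),
    "my_project.featurize",
    {"input_dir": "data/clean", "output_dir": "data/features"},
    depends_on=[preprocess_job],
)
assert second is first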
def build_pegasus_wf(
    cwl_wf: cwl.Workflow, wf_files: dict, wf_input_str: dict
) -> Workflow:
    log.info("Building Pegasus workflow")

    wf = Workflow("cwl-converted-pegasus-workflow", infer_dependencies=True)

    for step in cwl_wf.steps:
        step_name = get_basename(step.id)
        log.info("Processing step: {}".format(step_name))
        cwl_cmd_ln_tool = (
            cwl.load_document(step.run) if isinstance(step.run, str) else step.run
        )

        job = Job(PurePath(cwl_cmd_ln_tool.baseCommand).name, _id=get_basename(step.id))

        # collect current step inputs
        log.info("Collecting step inputs from {}".format(step_name))
        step_inputs = dict()
        for _input in step.in_:
            input_id = get_basename(_input.id)
            step_inputs[input_id] = get_basename(_input.source)
            log.debug("step_inputs[{}] = {}".format(input_id, step_inputs[input_id]))

        # add inputs that are of type File
        for _input in cwl_cmd_ln_tool.inputs:
            if _input.type == "File":
                wf_file = File(wf_files[step_inputs[get_name(step.id, _input.id)]])
                job.add_inputs(wf_file)
                log.info("Step: {} added input file: {}".format(step_name, wf_file.lfn))
            """
            # TODO: handle File[] inputs
            elif isinstance(_input.type, cwl.CommandInputArraySchema):
                if _input.type.items == "File":
                    for f in step_inputs[get_name(step.id, _input.id)]:
                        wf_file = File(wf_files[f])
                        job.add_inputs(wf_file)
                        log.info(
                            "Step: {} added input file: {}".format(
                                step_name, wf_file.lfn
                            )
                        )
            """

        # add job outputs that are of type File
        log.info("Collecting step outputs from {}".format(step_name))
        for output in cwl_cmd_ln_tool.outputs:
            if output.type == "File":
                wf_file = File(wf_files[get_name(step.id, output.id)])
                job.add_outputs(wf_file)
                log.info("Step: {} added output file: {}".format(step_name, wf_file.lfn))
            else:
                raise NotImplementedError(
                    "Support for output types other than File is in development"
                )

        # add job args
        args = (
            cwl_cmd_ln_tool.arguments
            if cwl_cmd_ln_tool.arguments is not None
            else list()
        )

        # args will be added in the order of their assigned inputBinding
        def get_input_binding(_input):
            key = 0
            if hasattr(_input, "inputBinding") and hasattr(
                _input.inputBinding, "position"
            ):
                key = _input.inputBinding.position

            return key if key else 0

        cwl_cmd_ln_tool_inputs = sorted(cwl_cmd_ln_tool.inputs, key=get_input_binding)

        for _input in cwl_cmd_ln_tool_inputs:
            # indicates whether or not input will appear in args
            if _input.inputBinding is not None:
                prefix = _input.inputBinding.prefix
                separate = _input.inputBinding.separate

                current_arg = ""
                if prefix:
                    current_arg += prefix

                if separate:
                    current_arg += " "

                if _input.type == "File":
                    current_arg += wf_files[step_inputs[get_name(step.id, _input.id)]]
                elif _input.type == "string":
                    current_arg += wf_input_str[
                        step_inputs[get_name(step.id, _input.id)]
                    ]
                # TODO: provide better support for array inputs being used in args
                # (see https://www.commonwl.org/user_guide/09-array-inputs/index.html)
                elif isinstance(_input.type, cwl.CommandInputArraySchema):
                    separator = (
                        " "
                        if _input.inputBinding.itemSeparator is None
                        else _input.inputBinding.itemSeparator
                    )

                    if _input.type.items == "File":
                        current_arg += separator.join(
                            wf_files[f]
                            for f in step_inputs[get_name(step.id, _input.id)]
                        )
                    elif _input.type.items == "string":
                        current_arg += separator.join(
                            wf_input_str[step_inputs[get_name(step.id, _input.id)]]
                        )

                args.append(current_arg)

        job.add_args(*args)
        wf.add_jobs(job)

        log.info("Added job: {}".format(step.run))
        log.info("\tcmd: {}".format(job.transformation))
        log.info("\targs: {}".format(job.args))
        log.info("\tinputs: {}".format([f.lfn for f in job.get_inputs()]))
        log.info("\toutputs: {}".format([f.lfn for f in job.get_outputs()]))

    log.info("Building workflow complete. {} jobs added".format(len(wf.jobs)))

    return wf
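# Usage sketch (illustrative, not from the source). Assumes the CWL workflow has been loaded
# with the same cwl module used above, and that wf_files / wf_input_str map workflow input
# and step-output names to physical file names and string values, as the loops above expect.
# The dictionary contents below are hypothetical; check the surrounding converter for how
# these mappings are actually populated.
cwl_wf = cwl.load_document("workflow.cwl")
wf_files = {"input_tarball": "data.tar.gz", "untar/extracted": "data.txt"}  # hypothetical
wf_input_str = {"compression_flag": "-xzf"}                                 # hypothetical
pegasus_wf = build_pegasus_wf(cwl_wf, wf_files, wf_input_str)
pegasus_wf.write("workflow.yml")  # serialize with the Pegasus 5.x Workflow API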