Example #1
    def _update_job_settings(
        self,
        category: str,
        checkpoint_path: Path,
        ckpt_name: Locator,
        depends_on,
        job: Job,
        job_name: Locator,
        job_profiles: Iterable[PegasusProfile],
        resource_request: ResourceRequest,
        times_to_retry_job: int,
    ) -> DependencyNode:
        """
        Apply a variety of shared settings to a job.

        Centralized logic for multiple job types to use.
        """
        self._job_graph.add_jobs(job)

        # Configure SLURM resource request
        resource_request.apply_to_job(job,
                                      job_name=self._job_name_for(job_name))

        # Set the DAGMAN category to potentially limit the number of active jobs
        job.add_dagman_profile(category=category,
                               retry=str(times_to_retry_job))

        # Apply any other user-defined Pegasus profiles
        for profile in job_profiles:
            job.add_profiles(profile.namespace,
                             key=profile.key,
                             value=profile.value)

        # Handle dependent job additions from the `depends_on` argument
        for parent_dependency in depends_on:
            if parent_dependency.job:
                self._job_graph.add_dependency(job,
                                               parents=[parent_dependency.job])
            for out_file in parent_dependency.output_files:
                job.add_inputs(out_file)

        # Handle Output Files
        # This is currently only handled as the checkpoint file
        # See: https://github.com/isi-vista/vista-pegasus-wrapper/issues/25
        # If the checkpoint file already exists, we want to add it to the replica catalog
        # so that we don't run the job corresponding to the checkpoint file again
        checkpoint_pegasus_file = self.create_file(
            f"{ckpt_name}",
            checkpoint_path,
            add_to_catalog=checkpoint_path.exists())
        job.add_outputs(checkpoint_pegasus_file, stage_out=False)

        return DependencyNode.from_job(job,
                                       output_files=[checkpoint_pegasus_file])
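
For orientation: `DependencyNode` couples a job with the output files downstream jobs may consume; the `parent_dependency.job` and `parent_dependency.output_files` accesses above rely on exactly that shape. A minimal sketch of the type implied here (hypothetical; the real class in the wrapper may carry more):

    class DependencyNode:
        """Sketch only: a scheduled job plus the files it promises to produce."""

        def __init__(self, job, output_files):
            self.job = job                    # may be None for pre-existing files
            self.output_files = output_files  # fed to add_inputs() by dependents

        @classmethod
        def from_job(cls, job, *, output_files):
            return cls(job, output_files)
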
Example #2

    def apply_to_job(self, job: Job, *, job_name: str) -> None:
        """Apply this SLURM resource request to *job* as Pegasus/DAGMan profiles."""
        if not self.partition:
            raise RuntimeError("A partition to run on must be specified.")

        if self.partition.max_walltime < self.job_time_in_minutes:
            raise ValueError(
                f"Partition '{self.partition.name}' has a max walltime of {self.partition.max_walltime} mins, which is less than the time given ({self.job_time_in_minutes} mins) for job: {job_name}."
            )

        slurm_resource_content = SLURM_RESOURCE_STRING.format(
            num_cpus=self.num_cpus or 1,
            num_gpus=self.num_gpus if self.num_gpus is not None else 0,
            job_name=job_name,
            mem_str=to_slurm_memory_string(self.memory
                                           or _SLURM_DEFAULT_MEMORY),
        )

        if (self.exclude_list and self.run_on_single_node
                and self.run_on_single_node in self.exclude_list):
            raise ValueError(
                "the 'exclude_list' and 'run_on_single_node' options are not consistent."
            )

        if self.exclude_list:
            slurm_resource_content += f" --exclude={self.exclude_list}"

        if self.run_on_single_node:
            slurm_resource_content += f" --nodelist={self.run_on_single_node}"

        if self.partition.name in (SCAVENGE, EPHEMERAL):
            slurm_resource_content += f" --qos={self.partition.name}"

        job.add_pegasus_profile(
            runtime=str(self.job_time_in_minutes * 60),
            queue=str(self.partition.name),
            project=_BORROWED_KEY if self.partition.name
            in (EPHEMERAL, SCAVENGE) else self.partition.name,
            glite_arguments=slurm_resource_content,
        )

        if ("dagman" not in job.profiles.keys()
                or "CATEGORY" not in job.profiles["dagman"].keys()):
            job.add_dagman_profile(category=str(self.partition))
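
The `glite_arguments` value assembled above reaches SLURM as raw sbatch-style flags. `SLURM_RESOURCE_STRING` is defined elsewhere in the wrapper; the template below is a hypothetical stand-in, shown only to make the `.format(...)` call concrete:

    # Hypothetical template; the real constant lives elsewhere in the wrapper.
    SLURM_RESOURCE_STRING = (
        "--ntasks=1 --cpus-per-task={num_cpus} --gpus-per-task={num_gpus}"
        " --job-name={job_name} --mem={mem_str}"
    )

    # With num_cpus=4, num_gpus=1, job_name="train", and mem_str="8G", plus an
    # exclude list, the resulting glite arguments would look roughly like:
    #   --ntasks=1 --cpus-per-task=4 --gpus-per-task=1 --job-name=train
    #   --mem=8G --exclude=node01,node02
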
Example #3
    def run_bash(
        self,
        job_name: Locator,
        command: Union[Iterable[str], str],
        *,
        depends_on,
        resource_request: Optional[ResourceRequest] = None,
        category: Optional[str] = None,
        job_is_stageable: bool = False,
        job_bypass_staging: bool = False,
        times_to_retry_job: int = 0,
        job_profiles: Iterable[PegasusProfile] = immutableset(),
        container: Optional[Container] = None,
        path_to_bash: Path = BASH_EXECUTABLE_PATH,
    ) -> DependencyNode:
        """
        Schedule a job to run the given *command* with the given *resource_request*.

        If this job requires other jobs to be executed first,
        include them in *depends_on*.

        This method returns a `DependencyNode` which can be used in *depends_on*
        for future jobs.
        """
        if isinstance(command, str):
            command = [command]

        commands_hashable = immutableset(command)

        signature = (job_name, commands_hashable)
        if signature in self._signature_to_job:
            logging.info("Job %s recognized as duplicate", job_name)
            return self._signature_to_job[signature]

        depends_on = _canonicalize_depends_on(depends_on)

        bash_transform = self._define_transformation(
            "bash",
            str(path_to_bash.absolute()),
            site=self._default_site,
            container=container,
            is_stageable=job_is_stageable,
            bypass_staging=job_bypass_staging,
        ).transformation

        job_dir = self.directory_for(job_name)
        ckpt_name = job_name / "___ckpt"
        ckpt_path = job_dir / "___ckpt"
        job_script = job_dir / "script.sh"

        commands_with_ckpt = list(command)
        commands_with_ckpt.append(f"touch {ckpt_path.absolute()}")
        commands_with_ckpt.insert(0, f"cd {job_dir}")

        job_script.write_text("\n".join(commands_with_ckpt))
        resource_request = self.set_resource_request(resource_request)

        bash_job = Job(bash_transform)
        bash_job.add_args(str(job_script.absolute()))
        dependency_node = self._update_job_settings(
            category,
            ckpt_path,
            ckpt_name,
            depends_on,
            bash_job,
            job_name,
            job_profiles,
            resource_request,
            times_to_retry_job,
        )

        self._signature_to_job[signature] = dependency_node
        logging.info("Scheduled bash job %s", job_name)

        return dependency_node
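
As the docstring notes, the returned `DependencyNode` feeds straight into *depends_on* for later jobs. A hedged usage sketch (the `workflow` object and the `Locator([...])` construction are assumptions about the surrounding wrapper API):

    # Sketch: chain two bash jobs; "train" waits for "preprocess" to finish
    # and picks up its checkpoint file as an input.
    preprocess = workflow.run_bash(
        Locator(["preprocess"]),
        ["python preprocess.py --out data.tsv"],
        depends_on=[],
    )
    train = workflow.run_bash(
        Locator(["train"]),
        "python train.py --in data.tsv",  # a bare string is wrapped into a list
        depends_on=[preprocess],
        times_to_retry_job=2,
    )
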
Example #4
    def run_container(
        self,
        job_name: Locator,
        docker_image_name: str,
        docker_args: str,
        docker_run_command: str,
        docker_tar_path: str,
        *,
        depends_on,
        resource_request: Optional[ResourceRequest] = None,
        category: Optional[str] = None,
        pre_job_bash: str = "",
        post_job_bash: str = "",
        job_is_stageable: bool = False,
        job_bypass_staging: bool = True,
        times_to_retry_job: int = 0,
        job_profiles: Iterable[PegasusProfile] = immutableset(),
    ) -> DependencyNode:
        """
        Schedule a job to run the given Docker container.

        This method returns a `DependencyNode` which can be used in *depends_on*
        for future jobs.
        """
        job_dir = self.directory_for(job_name)
        ckpt_name = job_name / "___ckpt"
        checkpoint_path = job_dir / "___ckpt"
        depends_on = _canonicalize_depends_on(depends_on)

        signature = (docker_image_name, docker_args)
        if signature in self._signature_to_job:
            logging.info("Job %s recognized as a duplicate", job_name)
            return self._signature_to_job[signature]

        script_path = job_dir / "___run.sh"

        # Part of one strategy to run a container through a bash script
        self._docker_script_generator.write_shell_script_to(
            docker_image_name=docker_image_name,
            docker_command=docker_run_command,
            docker_tar_path=docker_tar_path,
            working_directory=job_dir,
            script_path=script_path,
            cmd_args=docker_args,
            ckpt_path=checkpoint_path,
            pre_job=pre_job_bash,
            post_job=post_job_bash,
        )

        # TODO - Refactor this so it uses the BASH transformation to form a job
        # With the script path as an argument
        # https://github.com/isi-vista/vista-pegasus-wrapper/issues/103
        script_executable = Transformation(
            self._job_name_for(job_name),
            namespace=self._namespace,
            version="4.0",
            site=self._default_site,
            pfn=script_path,
            is_stageable=job_is_stageable,
            bypass_staging=job_bypass_staging,
            arch=Arch.X86_64,
            os_type=OS.LINUX,
        )

        self._transformation_catalog.add_transformations(script_executable)
        resource_request = self.set_resource_request(resource_request)

        job = Job(script_executable)
        dependency_node = self._update_job_settings(
            category,
            checkpoint_path,
            ckpt_name,
            depends_on,
            job,
            job_name,
            job_profiles,
            resource_request,
            times_to_retry_job,
        )
        self._signature_to_job[signature] = dependency_node

        logging.info("Scheduled Docker job %s", job_name)
        return dependency_node
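
A hedged usage sketch (image and paths are placeholders). Note that deduplication here is keyed on `(docker_image_name, docker_args)` rather than the job name, so two calls with an identical image and arguments return the same node:

    # Sketch: schedule a containerized step that runs after the bash job above.
    analysis = workflow.run_container(
        Locator(["analyze"]),
        docker_image_name="my-analysis:latest",
        docker_args="--input /data/in --output /data/out",
        docker_run_command="python /app/analyze.py",
        docker_tar_path="/nas/images/my-analysis.tar",
        depends_on=[train],
        pre_job_bash="module load singularity",  # placeholder site setup
    )
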
Example #5
    def _run_python_job(
        self,
        job_name: Locator,
        python_module_or_path: Any,
        args_or_params: Union[Parameters, Dict[str, Any], str],
        *,
        depends_on,
        resource_request: Optional[ResourceRequest] = None,
        override_conda_config: Optional[CondaConfiguration] = None,
        category: Optional[str] = None,
        use_pypy: bool = False,
        container: Optional[Container] = None,
        pre_job_bash: str = "",
        post_job_bash: str = "",
        job_is_stageable: bool = False,
        job_bypass_staging: bool = False,
        times_to_retry_job: int = 0,
        job_profiles: Iterable[PegasusProfile] = immutableset(),
        treat_params_as_cmd_args: bool = False,
        input_file_paths: Union[Iterable[Union[Path, str]], Path,
                                str] = immutableset(),
        output_file_paths: Union[Iterable[Union[Path, str]], Path,
                                 str] = immutableset(),
    ) -> DependencyNode:
        """
        Internal helper that centralizes the logic for scheduling a Python job.
        """
        job_dir = self.directory_for(job_name)
        ckpt_name = job_name / "___ckpt"
        checkpoint_path = job_dir / "___ckpt"
        signature_args = None
        depends_on = _canonicalize_depends_on(depends_on)

        if isinstance(python_module_or_path, (str, Path)):
            computed_module_or_path = python_module_or_path
        else:
            computed_module_or_path = fully_qualified_name(
                python_module_or_path)

        if not isinstance(args_or_params, str):
            # allow users to specify the parameters as a dict for convenience
            if not isinstance(args_or_params, Parameters):
                args_or_params = Parameters.from_mapping(args_or_params)

            params_sink = CharSink.to_string()
            YAMLParametersWriter().write(args_or_params, params_sink)
            signature_args = params_sink.last_string_written

        signature = (
            computed_module_or_path,
            signature_args if signature_args else args_or_params,
        )
        if signature in self._signature_to_job:
            logging.info("Job %s recognized as a duplicate", job_name)
            return self._signature_to_job[signature]

        if container:
            return self._run_python_in_container(
                job_name,
                computed_module_or_path,
                args_or_params,
                container,
                depends_on=depends_on,
                input_files=input_file_paths,
                output_files=output_file_paths,
                resource_request=resource_request,
                category=category,
                pre_docker_bash=pre_job_bash,
                post_docker_bash=post_job_bash,
                job_is_stageable=job_is_stageable,
                job_bypass_staging=job_bypass_staging,
                times_to_retry_job=times_to_retry_job,
                job_profiles=job_profiles,
            )

        script_path = job_dir / "___run.sh"
        stdout_path = job_dir / "___stdout.log"

        self._conda_script_generator.write_shell_script_to(
            entry_point_name=computed_module_or_path,
            parameters=args_or_params,
            working_directory=job_dir,
            script_path=script_path,
            params_path=job_dir / "____params.params",
            stdout_file=stdout_path,
            ckpt_path=checkpoint_path,
            override_conda_config=override_conda_config,
            python="pypy3" if use_pypy else "python",
            pre_job=pre_job_bash,
            post_job=post_job_bash,
            treat_params_as_cmd_args=treat_params_as_cmd_args,
        )

        script_executable = Transformation(
            self._job_name_for(job_name),
            namespace=self._namespace,
            version="4.0",
            site=self._default_site,
            pfn=script_path,
            is_stageable=job_is_stageable,
            bypass_staging=job_bypass_staging,
            arch=Arch.X86_64,
            os_type=OS.LINUX,
            container=container,
        )

        self._transformation_catalog.add_transformations(script_executable)
        resource_request = self.set_resource_request(resource_request)

        job = Job(script_executable)
        dependency_node = self._update_job_settings(
            category,
            checkpoint_path,
            ckpt_name,
            depends_on,
            job,
            job_name,
            job_profiles,
            resource_request,
            times_to_retry_job,
        )
        self._signature_to_job[signature] = dependency_node

        logging.info("Scheduled Python job %s", job_name)
        return dependency_node
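
`_run_python_job` is internal; the public entry points funnel into it. A hedged sketch of one such call path (`run_python_on_parameters` is an assumption about the wrapper's public API, mirroring the parameters above):

    # Sketch: schedule a Python module over a parameter dict.  The dict is
    # coerced to Parameters and serialized to YAML for the dedup signature.
    node = workflow.run_python_on_parameters(
        Locator(["train"]),
        "my_package.train",                     # or a module object, resolved
        {"learning_rate": 1e-3, "epochs": 10},  # via fully_qualified_name()
        depends_on=[preprocess],
    )
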
Example #6
def build_pegasus_wf(cwl_wf: cwl.Workflow, wf_files: dict,
                     wf_input_str: dict) -> Workflow:
    log.info("Building Pegasus workflow")

    wf = Workflow("cwl-converted-pegasus-workflow", infer_dependencies=True)

    for step in cwl_wf.steps:
        step_name = get_basename(step.id)
        log.info("Processing step: {}".format(step_name))
        cwl_cmd_ln_tool = (cwl.load_document(step.run) if isinstance(
            step.run, str) else step.run)

        job = Job(PurePath(cwl_cmd_ln_tool.baseCommand).name,
                  _id=get_basename(step.id))

        # collect current step inputs
        log.info("Collecting step inputs from {}".format(step_name))
        step_inputs = dict()
        for _input in step.in_:
            input_id = get_basename(_input.id)

            step_inputs[input_id] = get_basename(_input.source)
            log.debug("step_inputs[{}] = {}".format(input_id,
                                                    step_inputs[input_id]))

        # add inputs that are of type File
        for _input in cwl_cmd_ln_tool.inputs:
            if _input.type == "File":
                wf_file = File(wf_files[step_inputs[get_name(
                    step.id, _input.id)]])

                job.add_inputs(wf_file)
                log.info("Step: {} added input file: {}".format(
                    step_name, wf_file.lfn))
            """
            # TODO: handle File[] inputs
            elif isinstance(_input.type, cwl.CommandInputArraySchema):
                if _input.type.items == "File":
                    for f in step_inputs[get_name(step.id, _input.id)]:
                        wf_file = File(wf_files[f])

                        job.add_inputs(wf_file)
                        log.info(
                            "Step: {} added input file: {}".format(
                                step_name, wf_file.lfn
                            )
                        )
            """
        # add job outputs that are of type File
        log.info("Collecting step outputs from {}".format(step_name))
        for output in cwl_cmd_ln_tool.outputs:
            if output.type == "File":
                wf_file = File(wf_files[get_name(step.id, output.id)])

                job.add_outputs(wf_file)
                log.info("Step: {} added output file: {}".format(
                    step_name, wf_file.lfn))
            else:
                raise NotImplementedError(
                    "Support for output types other than File is in development"
                )

        # add job args
        args = (cwl_cmd_ln_tool.arguments
                if cwl_cmd_ln_tool.arguments is not None else list())

        # args will be added in the order of their assigned inputBinding
        def get_input_binding(_input):
            key = 0
            if hasattr(_input, "inputBinding") and hasattr(
                    _input.inputBinding, "position"):
                key = _input.inputBinding.position

            return key if key else 0

        cwl_cmd_ln_tool_inputs = sorted(cwl_cmd_ln_tool.inputs,
                                        key=get_input_binding)

        for _input in cwl_cmd_ln_tool_inputs:
            # a non-None inputBinding means this input appears in the args
            if _input.inputBinding is not None:
                prefix = _input.inputBinding.prefix
                separate = _input.inputBinding.separate

                current_arg = ""
                if prefix:
                    current_arg += prefix

                if separate:
                    current_arg += " "

                if _input.type == "File":
                    current_arg += wf_files[step_inputs[get_name(
                        step.id, _input.id)]]
                elif _input.type == "string":
                    current_arg += wf_input_str[step_inputs[get_name(
                        step.id, _input.id)]]

                # TODO: provide better support for array inputs being used in args (see https://www.commonwl.org/user_guide/09-array-inputs/index.html)
                elif isinstance(_input.type, cwl.CommandInputArraySchema):
                    separator = (" "
                                 if _input.inputBinding.itemSeparator is None
                                 else _input.inputBinding.itemSeparator)

                    if _input.type.items == "File":
                        current_arg += separator.join(
                            wf_files[f]
                            for f in step_inputs[get_name(step.id, _input.id)])
                    elif _input.type.items == "string":
                        # mirror the File branch: join each referenced string value
                        current_arg += separator.join(
                            wf_input_str[f]
                            for f in step_inputs[get_name(step.id, _input.id)])

                args.append(current_arg)

        job.add_args(*args)
        wf.add_jobs(job)

        log.info("Added job: {}".format(step.run))
        log.info("\tcmd: {}".format(job.transformation))
        log.info("\targs: {}".format(job.args))
        log.info("\tinputs: {}".format([f.lfn for f in job.get_inputs()]))
        log.info("\toutputs: {}".format([f.lfn for f in job.get_outputs()]))

    log.info("Building workflow complete. {} jobs added".format(len(wf.jobs)))

    return wf
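
A hedged driver sketch (the two lookup dicts mirror the `wf_files` and `wf_input_str` parameters above; file names are placeholders):

    # Sketch: convert a parsed CWL workflow and write out the Pegasus workflow.
    cwl_wf = cwl.load_document("workflow.cwl")     # parsed CWL document
    wf_files = {"input_tsv": "data/input.tsv"}     # CWL input id -> file name
    wf_input_str = {"threshold": "0.5"}            # CWL input id -> string value
    wf = build_pegasus_wf(cwl_wf, wf_files, wf_input_str)
    wf.write("workflow.yml")                       # Pegasus 5.x Workflow API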