Beispiel #1
0
def collect_files(cwl_wf: cwl.Workflow) -> dict:
    """Collect the files consumed and produced by a CWL workflow.

    Workflow inputs of type ``File`` are stored with
    ``get_basename(_input.id)`` as both key and value. Step outputs of type
    ``File`` are stored under ``get_name(step.id, output.id)`` with the
    tool's ``outputBinding.glob`` as the value.

    Args:
        cwl_wf: The loaded CWL workflow.

    Returns:
        The dict of collected files described above.

    Raises:
        NotImplementedError: For ``File[]`` workflow inputs, for step outputs
            whose type is not ``File``, and for globs containing ``*`` or
            ``$``.
        ValueError: When a ``File`` output has no ``outputBinding.glob``.
    """
    log.info("Collecting workflow files")
    wf_files = dict()

    log.info("Parsing input files from workflow inputs")
    for _input in cwl_wf.inputs:
        if _input.type == "File":
            f = get_basename(_input.id)
            wf_files[f] = f
            log.info("Collected input file: {}".format(f))
            log.debug("wf_files[{0}] = {0}".format(f))

        elif (isinstance(_input.type, cwl.InputArraySchema)
              and _input.type.items == "File"):

            raise NotImplementedError(
                "Support for File[] workflow input type in development")

    log.info("Parsing output files from each workflow step")
    for step in cwl_wf.steps:
        # step.run is either a path to a tool document or an already loaded
        # tool object
        cwl_cmd_ln_tool = (cwl.load_document(step.run) if isinstance(
            step.run, str) else step.run)

        for output in cwl_cmd_ln_tool.outputs:
            if output.type != "File":
                raise NotImplementedError(
                    "Support for output types other than File is in development"
                )

            k = get_name(step.id, output.id)

            # A missing/None outputBinding or glob both end up as None here.
            # This avoids raising from inside a ``finally`` clause, which
            # would hide any unrelated exception behind an unbound-variable
            # error.
            v = getattr(getattr(output, "outputBinding", None), "glob", None)
            if not v:
                raise ValueError(
                    "outputBinding.glob must be specified (e.g. file1.txt) for {}"
                    .format(step.run))

            # the glob is used verbatim as a file name, so patterns and
            # expressions cannot be resolved
            if any(c in "*$" for c in v):
                raise NotImplementedError(
                    "Unable to resolve wildcards in {}".format(v))

            wf_files[k] = v
            log.info("Collected output file: {}".format(k))
            log.debug("wf_files[{}] = {}".format(k, v))

    log.info(
        "Collection of workflow files complete. {} files collected".format(
            len(wf_files)))

    return wf_files
Beispiel #2
0
def main():
    """Convert a CWL workflow into a Pegasus workflow and write it to a file.

    Parses the command line, loads the CWL workflow together with the
    workflow inputs and transformation specs, builds the transformation and
    replica catalogs, assembles the Pegasus workflow and writes it to the
    requested output path.

    Returns:
        0 once the workflow has been written.
    """
    cli = parse_args()
    setup_logger(cli.debug)

    # load everything the conversion needs
    cwl_path = cli.cwl_workflow_file_path
    cwl_workflow = cwl.load_document(cwl_path)
    log.info("Loaded cwl workflow: {}".format(cwl_path))

    inputs = load_wf_inputs(cli.workflow_inputs_file_path)
    specs = load_tr_specs(cli.transformation_spec_file_path)

    # catalogs first, then the workflow itself
    transformation_catalog = build_pegasus_tc(specs, cwl_workflow)
    replica_catalog = build_pegasus_rc(inputs, cwl_workflow)

    files = collect_files(cwl_workflow)
    input_strings = collect_input_strings(inputs, cwl_workflow)
    pegasus_wf = build_pegasus_wf(cwl_workflow, files, input_strings)

    pegasus_wf.add_transformation_catalog(transformation_catalog)
    pegasus_wf.add_replica_catalog(replica_catalog)

    destination = cli.output_file_path
    pegasus_wf.write(file=destination)
    log.info("Workflow written to {}".format(destination))

    return 0
Beispiel #3
0
def _input_binding_position(_input):
    """Sort key for a tool input: its inputBinding.position, or 0 if unset."""
    binding = getattr(_input, "inputBinding", None)
    return getattr(binding, "position", None) or 0


def build_pegasus_wf(cwl_wf: cwl.Workflow, wf_files: dict,
                     wf_input_str: dict) -> Workflow:
    """Build a Pegasus workflow with one job per CWL workflow step.

    For every step a ``Job`` named after the file name of the tool's
    ``baseCommand`` is created. ``File`` typed tool inputs and outputs are
    attached as job inputs/outputs, and the job arguments are the tool's
    ``arguments`` followed by one argument per tool input that has an
    ``inputBinding``, ordered by ``inputBinding.position``.

    Args:
        cwl_wf: The loaded CWL workflow.
        wf_files: Dict used to look up the file name for a step input source
            or a step output.
        wf_input_str: Dict used to look up the value of string
            (and string[]) inputs by their source.

    Returns:
        The populated Pegasus ``Workflow``.

    Raises:
        NotImplementedError: When a tool declares an output whose type is not
            ``File``.
    """
    log.info("Building Pegasus workflow")

    wf = Workflow("cwl-converted-pegasus-workflow", infer_dependencies=True)

    for step in cwl_wf.steps:
        step_name = get_basename(step.id)
        log.info("Processing step: {}".format(step_name))
        # step.run is either a path to a tool document or an already loaded
        # tool object
        cwl_cmd_ln_tool = (cwl.load_document(step.run) if isinstance(
            step.run, str) else step.run)

        job = Job(PurePath(cwl_cmd_ln_tool.baseCommand).name, _id=step_name)

        # collect current step inputs
        log.info("Collecting step inputs from {}".format(step_name))
        step_inputs = dict()
        for _input in step.in_:
            input_id = get_basename(_input.id)

            step_inputs[input_id] = get_basename(_input.source)
            log.debug("step_inputs[{}] = {}".format(input_id,
                                                    step_inputs[input_id]))

        # add inputs that are of type File
        # TODO: handle File[] inputs (cwl.CommandInputArraySchema whose items
        # are "File") by adding one job input per array element
        for _input in cwl_cmd_ln_tool.inputs:
            if _input.type == "File":
                wf_file = File(wf_files[step_inputs[get_name(
                    step.id, _input.id)]])

                job.add_inputs(wf_file)
                log.info("Step: {} added input file: {}".format(
                    step_name, wf_file.lfn))

        # add job outputs that are of type File
        log.info("Collecting step outputs from {}".format(step_name))
        for output in cwl_cmd_ln_tool.outputs:
            if output.type != "File":
                raise NotImplementedError(
                    "Support for output types other than File is in development"
                )

            wf_file = File(wf_files[get_name(step.id, output.id)])

            job.add_outputs(wf_file)
            log.info("Step: {} added output file: {}".format(
                step_name, wf_file.lfn))

        # add job args
        # Work on a copy: appending to the tool's own `arguments` list would
        # modify the tool object, so arguments would pile up whenever the
        # same tool object is processed again.
        args = list(cwl_cmd_ln_tool.arguments or [])

        # args will be added in the order of their assigned inputBinding
        cwl_cmd_ln_tool_inputs = sorted(cwl_cmd_ln_tool.inputs,
                                        key=_input_binding_position)

        for _input in cwl_cmd_ln_tool_inputs:
            # only inputs with an inputBinding appear in args
            if _input.inputBinding is None:
                continue

            prefix = _input.inputBinding.prefix
            separate = _input.inputBinding.separate

            current_arg = ""
            if prefix:
                current_arg += prefix

            # NOTE(review): the CWL spec documents `separate` as defaulting
            # to true; an unset value adds no space here - confirm intended.
            if separate:
                current_arg += " "

            if _input.type == "File":
                current_arg += wf_files[step_inputs[get_name(
                    step.id, _input.id)]]
            elif _input.type == "string":
                current_arg += wf_input_str[step_inputs[get_name(
                    step.id, _input.id)]]

            # TODO: provide better support for array inputs being used in args (see https://www.commonwl.org/user_guide/09-array-inputs/index.html)
            elif isinstance(_input.type, cwl.CommandInputArraySchema):
                separator = (" "
                             if _input.inputBinding.itemSeparator is None
                             else _input.inputBinding.itemSeparator)

                if _input.type.items == "File":
                    current_arg += separator.join(
                        wf_files[f]
                        for f in step_inputs[get_name(step.id, _input.id)])
                elif _input.type.items == "string":
                    current_arg += separator.join(
                        wf_input_str[step_inputs[get_name(
                            step.id, _input.id)]])

            # inputs of any other type contribute only their prefix
            args.append(current_arg)

        job.add_args(*args)
        wf.add_jobs(job)

        log.info("Added job: {}".format(step.run))
        log.info("\tcmd: {}".format(job.transformation))
        log.info("\targs: {}".format(job.args))
        log.info("\tinputs: {}".format([f.lfn for f in job.get_inputs()]))
        log.info("\toutputs: {}".format([f.lfn for f in job.get_outputs()]))

    log.info("Building workflow complete. {} jobs added".format(len(wf.jobs)))

    return wf
Beispiel #4
0
def build_pegasus_tc(tr_specs: dict,
                     cwl_wf: cwl.Workflow) -> TransformationCatalog:
    """Build a Pegasus transformation catalog from a CWL workflow's steps.

    One ``Transformation`` is created per step, named after the file name of
    the tool's ``baseCommand`` (which must be an absolute path). ``site`` and
    ``is_stageable`` are taken from ``tr_specs[<tool name>]`` when both keys
    are present; otherwise the defaults ``"local"`` and ``True`` are used.
    The first ``DockerRequirement`` found on a tool is registered as a docker
    container and attached to the tool's transformation.

    Args:
        tr_specs: Dict keyed by tool file name; each value provides ``"site"``
            and ``"is_stageable"``.
        cwl_wf: The loaded CWL workflow.

    Returns:
        The populated ``TransformationCatalog``.

    Raises:
        ValueError: When a tool has no ``baseCommand`` or it is not an
            absolute path.
        NotImplementedError: When a ``DockerRequirement`` sets neither
            ``dockerPull`` nor ``dockerLoad``.
    """
    log.info("Building transformation catalog")
    tc = TransformationCatalog()

    for step in cwl_wf.steps:
        # step.run is either a path to a tool document or an already loaded
        # tool object
        cwl_cmd_ln_tool = (cwl.load_document(step.run) if isinstance(
            step.run, str) else step.run)

        if cwl_cmd_ln_tool.baseCommand is None:
            raise ValueError("{} requires a 'baseCommand'".format(
                cwl_cmd_ln_tool.id))

        tool_path = PurePath(cwl_cmd_ln_tool.baseCommand)

        if not tool_path.is_absolute():
            raise ValueError(
                "{}.baseCommand: {} must be an absolute path".format(
                    cwl_cmd_ln_tool.id, cwl_cmd_ln_tool.baseCommand))

        log.debug("baseCommand: {}".format(tool_path))

        # TODO: handle requirements (may not be needed or can manually add them in)
        site = "local"
        is_stageable = True

        try:
            spec = tr_specs[tool_path.name]
            # Assign both values in one step so a spec that lacks one key
            # cannot leave a half-applied override while the warning below
            # claims the defaults are in use.
            site, is_stageable = spec["site"], spec["is_stageable"]
        except KeyError:
            log.warning(
                "Unable to look up transformation: {} in transformation spec file. Using defaults: site='local', is_stageable=True"
                .format(tool_path.name))

        container_name = None
        if cwl_cmd_ln_tool.requirements:
            for req in cwl_cmd_ln_tool.requirements:
                if not isinstance(req, cwl.DockerRequirement):
                    continue

                # Currently not supported in DockerRequirement:
                # - dockerFile
                # - dockerImport
                # - dockerImageId
                # - dockerOutputDirectory

                # assume to be docker because we can't distinguish between
                # docker and singularity just by image name or url of zipped
                # file
                if req.dockerPull:
                    # planner won't allow deep lfn (for example: opensciencegrid/osg-el7 is invalid)
                    container_name = req.dockerPull.replace("/", "_")
                    image = "docker://" + req.dockerPull
                elif req.dockerLoad:
                    image = req.dockerLoad
                    container_name = Path(req.dockerLoad).name
                else:
                    raise NotImplementedError(
                        "Only DockerRequirement.dockerPull and DockerRequirement.dockerLoad currently supported"
                    )

                try:
                    tc.add_containers(
                        Container(
                            container_name,
                            Container.DOCKER,
                            image,
                            image_site="local",
                        ))
                except DuplicateError:
                    # the container is already in the catalog, so do not
                    # report it as added
                    log.debug(
                        "<Container {}> is a duplicate and has already been added."
                        .format(container_name))
                else:
                    log.info(
                        "Added <Container name={}, container_type='docker', image={}, image_site='local'> from CommandLineTool: {}"
                        .format(container_name, image, cwl_cmd_ln_tool.id))

                log.warning(
                    "Container types in the transformation catalog will need to be modified if containers are not of type: docker or if image file exists on a site other than 'local'"
                )

                # only the first DockerRequirement of a tool is used
                break

        tr = Transformation(
            tool_path.name,
            site=site,
            pfn=str(tool_path),
            is_stageable=is_stageable,
            container=container_name,
        )
        log.debug(
            "tr = Transformation({}, site={}, pfn={}, is_stageable={})".format(
                tool_path.name, site, str(tool_path), is_stageable))
        log.info("Adding <Transformation {}>".format(tr.name))

        try:
            tc.add_transformations(tr)
        except DuplicateError:
            log.warning(
                "<Transformation {}> is a duplicate and has already been added."
                .format(tr.name))

    log.info(
        "Building transformation catalog complete. {} transformations, {} containers added."
        .format(len(tc.transformations), len(tc.containers)))

    return tc