Example #1
def _write_tool(step_dir, name, inputs, outputs, parallel):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "inputs": [],
           "outputs": []}
    if not parallel:
        inputs = [{"id": "#sentinel", "type": {"type": "array", "items": "string"},
                   "default": ["multisample"]}] + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        if "secondaryFiles" in inp_tool:
            inp_binding["secondaryFiles"] = inp_tool.pop("secondaryFiles")
        if parallel:
            inp_tool["inputBinding"] = inp_binding
        else:
            inp_tool["type"]["inputBinding"] = inp_binding
        out["inputs"].append(inp_tool)
    # XXX Need to generalize outputs, just a hack for now to test align_prep
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
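A recurring detail in these _write_tool variants is the locally defined str_presenter, registered with PyYAML so that any multi-line string is emitted in block (|) style instead of a quoted one-liner. A minimal standalone sketch of that pattern; the "script" key and its value are just illustrative placeholders:

import yaml

def str_presenter(dumper, data):
    # Multi-line strings dump in readable block (|) style; everything else is unchanged.
    if len(data.splitlines()) > 1:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)

yaml.add_representer(str, str_presenter)
print(yaml.dump({"script": "line one\nline two\n"}, default_flow_style=False))
# script: |
#   line one
#   line two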
Example #2
def _step_template(name, run_file, inputs, outputs, parallel, scatter=None):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    scatter_inputs = []
    sinputs = []
    for inp in inputs:
        step_inp = {"id": workflow.get_base_id(inp["id"]), "source": inp["id"]}
        if inp.get("wf_duplicate"):
            step_inp["id"] += "_toolinput"
        for attr in ["source", "valueFrom"]:
            if attr in inp:
                step_inp[attr] = inp[attr]
        sinputs.append(step_inp)
        # scatter on inputs from previous processes that have been arrayed
        if (_is_scatter_parallel(parallel) and (_do_scatter_var(inp, parallel)
                                                or (scatter and inp["id"] in scatter))):
            scatter_inputs.append(step_inp["id"])
    out = {"run": run_file,
           "id": name,
           "in": sinputs,
           "out": [{"id": workflow.get_base_id(output["id"])} for output in outputs]}
    if _is_scatter_parallel(parallel):
        assert scatter_inputs, "Did not find items to scatter on: %s" % name
        out.update({"scatterMethod": "dotproduct",
                    "scatter": scatter_inputs})
    return out
Example #3
def _write_tool(step_dir, name, inputs, outputs, parallel):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "inputs": [],
           "outputs": []}
    pinputs = [{"id": "#sentinel-parallel", "type": "string",
                "default": parallel}]
    inputs = pinputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
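This revision calls _place_input_binding and _place_secondary_files, which never appear in this listing. A rough, hedged reconstruction based on the inline handling in Example #1 (and the more elaborate version in Example #7 below); the signatures are assumptions, the real helpers also dispatch on parallel, and later revisions pass inp_binding=None once arguments are supplied via JSON instead of the command line:

def _place_input_binding(inp_tool, inp_binding, parallel):
    # Sketch only: flat inputs take the binding at the top level, while inputs
    # with a nested array type attach it to the array type, mirroring the
    # if/else blocks in Examples #1 and #7. The real helper also consults
    # `parallel`; that dispatch is omitted here.
    if not isinstance(inp_tool.get("type"), dict):
        inp_tool["inputBinding"] = inp_binding
    else:
        inp_tool["type"]["inputBinding"] = inp_binding
    return inp_tool

def _place_secondary_files(inp_tool, inp_binding=None):
    # Sketch only: move secondaryFiles onto the command line binding when one
    # is used, so index files travel with their primary file.
    if "secondaryFiles" in inp_tool and inp_binding is not None:
        inp_binding["secondaryFiles"] = inp_tool.pop("secondaryFiles")
    return inp_tool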
Example #4
def _write_tool(step_dir, name, inputs, outputs, parallel, programs, file_estimates, disk, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {"class": "ResourceRequirement",
               "coresMin": cores, "ramMin": mem_mb_total}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "cwl:draft-3",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    out["arguments"].append({"position": 0, "valueFrom": "sentinel-runtime=$(runtime)"})
    std_inputs = [{"id": "#sentinel-parallel", "type": "string",
                   "default": parallel},
                  {"id": "#sentinel-outputs", "type": "string",
                   "default": json.dumps([workflow.get_base_id(x["id"]) for x in outputs],
                                         sort_keys=True, separators=(',', ':'))}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
Example #5
def _add_outputs_to_tool(outputs, tool):
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool = _clean_record(outp_tool)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        tool["outputs"].append(outp_tool)
    return tool
Example #6
def _get_sentinel_val(v):
    """Retrieve expected sentinel value for an output, expanding records.
    """
    out = workflow.get_base_id(v["id"])
    if workflow.is_cwl_record(v):
        out += ":%s" % ";".join([x["name"] for x in _get_record_fields(v)])
    return out
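The sentinel string therefore encodes, for record outputs, both the output id and the record's field names. A self-contained illustration of the resulting format, using a simple stand-in for the workflow/record helpers and a hypothetical record layout:

def _fields(d):
    # Mirrors the nested field lookup used by these examples: find the first
    # "fields" list anywhere inside the (possibly array-wrapped) record type.
    if isinstance(d, dict):
        if "fields" in d:
            return d["fields"]
        for v in d.values():
            fields = _fields(v)
            if fields:
                return fields

# Hypothetical record-style output, shaped like the CWL outputs consumed above.
outp = {"id": "align_rec",
        "type": {"type": "array",
                 "items": {"type": "record", "name": "align_rec",
                           "fields": [{"name": "align_bam"},
                                      {"name": "work_bam_plus__disc"}]}}}
print("%s:%s" % (outp["id"], ";".join(f["name"] for f in _fields(outp["type"]))))
# align_rec:align_bam;work_bam_plus__disc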
Example #7
def _write_tool(step_dir, name, inputs, outputs, parallel):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "inputs": [],
           "outputs": []}
    pinputs = [{"id": "#sentinel-pin", "type": {"type": "array", "items": "string"},
                "default": [parallel.input]},
               {"id": "#sentinel-pout", "type": {"type": "array", "items": "string"},
                "default": [parallel.output]}]
    inputs = pinputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        if "secondaryFiles" in inp_tool:
            # if we have a nested list of files, ensure we pass the index for each
            # Need a second input binding we ignore to get the secondaryFiles
            # XXX Ideally could use `valueFrom: null` but that doesn't seem to work
            if parallel.baseline in ["single", "merge"] and tz.get_in(["type", "type"], inp_tool) == "array":
                nested_inp_binding = copy.deepcopy(inp_binding)
                nested_inp_binding["prefix"] = "ignore="
                nested_inp_binding["secondaryFiles"] = inp_tool.pop("secondaryFiles")
                inp_tool["type"]["inputBinding"] = nested_inp_binding
            # otherwise, add it at the top level
            else:
                inp_binding["secondaryFiles"] = inp_tool.pop("secondaryFiles")
        if parallel.baseline in ["single", "merge"] or not isinstance(inp_tool["type"], dict):
            inp_tool["inputBinding"] = inp_binding
        else:
            inp_tool["type"]["inputBinding"] = inp_binding
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
Example #8
def _write_tool(step_dir, name, inputs, outputs, parallel, programs, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory(programs if programs else ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [{"class": "ResourceRequirement",
                      "coresMin": cores, "ramMin": mem_mb_total}],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    out["arguments"].append({"position": 0, "prefix": "sentinel-runtime=", "separate": False,
                             "valueFrom": "$(JSON.stringify(runtime))"})
    std_inputs = [{"id": "#sentinel-parallel", "type": "string",
                   "default": parallel}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
Example #9
def _step_template(name, run_file, inputs, outputs, parallel):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    scatter_inputs = []
    sinputs = []
    for inp in inputs:
        step_inp = {"id": workflow.get_base_id(inp["id"]), "source": inp["id"]}
        for attr in ["source", "valueFrom"]:
            if attr in inp:
                step_inp[attr] = inp[attr]
        sinputs.append(step_inp)
        # scatter on inputs from previous processes that have been arrayed
        if parallel in "multi-parallel" or len(inp["id"].split("/")) > 1:
            scatter_inputs.append("%s/%s" % (name, step_inp["id"]))
    out = {"run": run_file,
           "id": name,
           "in": sinputs,
           "out": [{"id": workflow.get_base_id(output["id"])} for output in outputs]}
    if parallel in ["single-parallel", "multi-parallel", "batch-parallel"]:
        out.update({"scatterMethod": "dotproduct",
                    "scatter": scatter_inputs})
    return out
Example #10
def _step_template(name, step_dir, inputs, outputs, parallel):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    step_file = _write_tool(step_dir, name, inputs, outputs, parallel)
    inputs = [{"id": "#%s.%s" % (name, workflow.get_base_id(inp["id"])), "source": inp["id"]}
              for inp in inputs]
    out = {"run": {"import": step_file},
           "id": "#%s" % name,
           "inputs": inputs,
           "outputs": [{"id": output["id"]} for output in outputs]}
    if parallel:
        out.update({"scatterMethod": "dotproduct",
                    "scatter": [x["id"] for x in inputs]})
    return out
Example #11
def _get_sentinel_val(v):
    """Retrieve expected sentinel value for an output, expanding records.
    """
    out = workflow.get_base_id(v["id"])
    if workflow.is_cwl_record(v):
        def _get_fields(d):
            if isinstance(d, dict):
                if "fields" in d:
                    return d["fields"]
                else:
                    for v in d.values():
                        fields = _get_fields(v)
                        if fields:
                            return fields
        out += ":%s" % ";".join([x["name"] for x in _get_fields(v)])
    return out
Example #12
def _get_sentinel_val(v):
    """Retrieve expected sentinel value for an output, expanding records.
    """
    out = workflow.get_base_id(v["id"])
    if workflow.is_cwl_record(v):

        def _get_fields(d):
            if isinstance(d, dict):
                if "fields" in d:
                    return d["fields"]
                else:
                    for v in d.values():
                        fields = _get_fields(v)
                        if fields:
                            return fields

        out += ":%s" % ";".join([x["name"] for x in _get_fields(v)])
    return out
Example #13
def _step_template(name, run_file, inputs, outputs, parallel):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    scatter_inputs = []
    sinputs = []
    for inp in inputs:
        step_inp = {"id": "#%s.%s" % (name, workflow.get_base_id(inp["id"])), "source": inp["id"]}
        sinputs.append(step_inp)
        # scatter on inputs from previous processes that have been arrayed
        if parallel.baseline == "multi" or len(inp["id"].split(".")) > 1:
            scatter_inputs.append(step_inp["id"])
    out = {"run": {"import": run_file},
           "id": "#%s" % name,
           "inputs": sinputs,
           "outputs": [{"id": output["id"]} for output in outputs]}
    if parallel.input in ["batch"] and parallel.baseline in ["single", "multi"]:
        out.update({"scatterMethod": "dotproduct",
                    "scatter": scatter_inputs})
    return out
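Note that in this revision (and in Example #7) parallel is no longer a plain string but an object with baseline, input and output attributes. The class itself is not part of this listing; a minimal assumed stand-in that is just enough to read those two examples:

import collections

# Assumed shape only: provides the parallel.baseline / .input / .output
# attributes that Examples #7 and #13 access; the field values are illustrative.
Parallel = collections.namedtuple("Parallel", ["baseline", "input", "output"])
parallel = Parallel(baseline="multi", input="batch", output="batch")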
Example #14
def _step_template(name, step_dir, inputs, outputs, source=""):
    """Templating function for writing a step to avoid repeating namespaces.
    """
    step_file = _write_tool(step_dir, name, inputs, outputs)
    inputs = [{
        "id": "#%s.%s" % (name, workflow.get_base_id(inp["id"])),
        "source": inp["id"]
    } for inp in inputs]
    return {
        "run": {
            "import": step_file
        },
        "id": "#%s" % name,
        "scatterMethod": "dotproduct",
        "scatter": [x["id"] for x in inputs],
        "inputs": inputs,
        "outputs": [{
            "id": output["id"]
        } for output in outputs]
    }
Example #15
def _add_inputs_to_tool(inputs, tool, parallel, use_commandline_args=False):
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        # Ensure records and workflow inputs get scattered
        if (_is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel) and
              (workflow.is_cwl_record(inp) or inp.get("wf_duplicate"))):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {"prefix": "%s=" % base_id,
                           "separate": False, "itemSeparator": ";;", "position": i}
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        inp_tool = _clean_record(inp_tool)
        tool["inputs"].append(inp_tool)
    return tool
Example #16
def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    cores = step_cores if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {"class": "ResourceRequirement",
               "coresMin": cores, "ramMin": mem_mb_total, "outdirMin": bcbio_docker_disk}
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    docker = {"class": "DockerRequirement", "dockerPull": docker_image, "dockerImageId": docker_image}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
            cwl_res["outdirMin"] += total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "requirements": [],
           "hints": [docker, cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{"class": "InlineJavascriptRequirement"},
                            {"class": "InitialWorkDirRequirement",
                                "listing": [{"entryname": "cwl.inputs.json",
                                            "entry": "$(JSON.stringify(inputs))"}]}]
    out["arguments"] += [{"position": 0, "valueFrom":
                          "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"},
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" % ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join(["%s:%s" %
                                                          (workflow.get_base_id(v["id"]),
                                                           "record" if workflow.is_cwl_record(v) else "var")
                                                          for v in inputs])]
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        if _is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {"prefix": "%s=" % base_id,
                           "separate": False, "itemSeparator": ";;", "position": i}
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
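The nested resolve_package helper above maps conda-style name=version strings onto CWL SoftwareRequirement package entries pointing at Bioconda. A quick standalone check of both branches, with illustrative package strings:

def resolve_package(p):
    out = {}
    parts = p.split("=")
    if len(parts) == 2:
        out["package"], out["version"] = parts[0], [parts[1]]
    else:
        out["package"] = p
    out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
    return out

print(resolve_package("samtools=1.9"))
# {'package': 'samtools', 'version': ['1.9'], 'specs': ['https://anaconda.org/bioconda/samtools']}
print(resolve_package("bwa"))
# {'package': 'bwa', 'specs': ['https://anaconda.org/bioconda/bwa']}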
Example #17
def _write_tool(step_dir,
                name,
                inputs,
                outputs,
                parallel,
                image,
                programs,
                file_estimates,
                disk,
                step_cores,
                samples,
                cur_remotes,
                no_files,
                container_tags=None):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory(
        (programs or []) + ["default"], samples)
    cores = min([step_cores, resource_cores]) if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {
        "class": "ResourceRequirement",
        "coresMin": cores,
        "ramMin": mem_mb_total
    }
    disk_hint, input_hint = _get_disk_estimates(name, parallel, inputs,
                                                file_estimates, samples, disk,
                                                cur_remotes, no_files)
    cwl_res.update(disk_hint)
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    if container_tags is not None:
        docker_image, container_tags = _add_current_quay_tag(
            docker_image, container_tags)
    docker = {
        "class": "DockerRequirement",
        "dockerPull": docker_image,
        "dockerImageId": docker_image
    }
    out = {
        "class": "CommandLineTool",
        "cwlVersion": "v1.0",
        "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
        "requirements": [],
        "hints": [docker, cwl_res, input_hint],
        "arguments": [],
        "inputs": [],
        "outputs": []
    }
    if programs:

        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = [
                "https://anaconda.org/bioconda/%s" % out["package"]
            ]
            return out

        out["hints"].append({
            "class": "SoftwareRequirement",
            "packages": [resolve_package(p) for p in programs]
        })
        # GATK requires networking for setting up log4j logging, use arvados extension
        if any(p.startswith(("gatk", "sentieon")) for p in programs):
            out["hints"] += [{"class": "arv:APIRequirement"}]
    # Multi-process methods that read heavily from BAM files need extra keep cache for Arvados
    if name in ["pipeline_summary", "variantcall_batch_region", "detect_sv"]:
        out["hints"] += [{
            "class": "arv:RuntimeConstraints",
            "keep_cache": 4096
        }]

    def add_to_namespaces(k, v, out):
        if "$namespaces" not in out:
            out["$namespaces"] = {}
        out["$namespaces"][k] = v
        return out

    if any(h.get("class", "").startswith("arv:") for h in out["hints"]):
        out = add_to_namespaces("arv", "http://arvados.org/cwl#", out)
    if any(h.get("class", "").startswith("dx") for h in out["hints"]):
        out = add_to_namespaces("dx", "https://www.dnanexus.com/cwl#", out)
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{
        "class": "InlineJavascriptRequirement"
    }, {
        "class":
        "InitialWorkDirRequirement",
        "listing": [{
            "entryname": "cwl.inputs.json",
            "entry": "$(JSON.stringify(inputs))"
        }]
    }]
    out["arguments"] += [{
        "position":
        0,
        "valueFrom":
        "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"
    },
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" %
                         ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join([
                             "%s:%s" %
                             (workflow.get_base_id(v["id"]),
                              "record" if workflow.is_cwl_record(v) else "var")
                             for v in inputs
                         ]), "run_number=0"]
    out = _add_inputs_to_tool(inputs, out, parallel, use_commandline_args)
    out = _add_outputs_to_tool(outputs, out)
    _tool_to_file(out, out_file)
    return os.path.join("steps", os.path.basename(out_file))
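This revision delegates serialization to _tool_to_file, which does not appear in this listing. Judging from the inline YAML dumping in the other _write_tool variants, it is presumably close to the following sketch (the name matches the call above; the body and return value are assumptions):

import yaml

def _tool_to_file(tool, out_file):
    # Assumed reconstruction: the same multiline-aware YAML dump that the
    # earlier examples perform inline inside _write_tool.
    def str_presenter(dumper, data):
        if len(data.splitlines()) > 1:  # multiline string -> block (|) style
            return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
        return dumper.represent_scalar('tag:yaml.org,2002:str', data)
    yaml.add_representer(str, str_presenter)
    with open(out_file, "w") as out_handle:
        yaml.dump(tool, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file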
Example #18
def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory(
        (programs or []) + ["default"], samples)
    cores = step_cores if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {
        "class": "ResourceRequirement",
        "coresMin": cores,
        "ramMin": mem_mb_total,
        "outdirMin": bcbio_docker_disk
    }
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    docker = {
        "class": "DockerRequirement",
        "dockerPull": docker_image,
        "dockerImageId": docker_image
    }
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
            cwl_res["outdirMin"] += total_estimate
    out = {
        "class": "CommandLineTool",
        "cwlVersion": "v1.0",
        "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
        "requirements": [],
        "hints": [docker, cwl_res],
        "arguments": [],
        "inputs": [],
        "outputs": []
    }
    if programs:

        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = [
                "https://anaconda.org/bioconda/%s" % out["package"]
            ]
            return out

        out["hints"].append({
            "class": "SoftwareRequirement",
            "packages": [resolve_package(p) for p in programs]
        })
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{
        "class": "InlineJavascriptRequirement"
    }, {
        "class":
        "InitialWorkDirRequirement",
        "listing": [{
            "entryname": "cwl.inputs.json",
            "entry": "$(JSON.stringify(inputs))"
        }]
    }]
    out["arguments"] += [{
        "position":
        0,
        "valueFrom":
        "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"
    },
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" %
                         ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join([
                             "%s:%s" %
                             (workflow.get_base_id(v["id"]),
                              "record" if workflow.is_cwl_record(v) else "var")
                             for v in inputs
                         ])]
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        if _is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {
                "prefix": "%s=" % base_id,
                "separate": False,
                "itemSeparator": ";;",
                "position": i
            }
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        inp_tool = _clean_record(inp_tool)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool = _clean_record(outp_tool)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:

        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str',
                                               data,
                                               style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)

        yaml.add_representer(str, str_presenter)
        yaml.dump(out,
                  out_handle,
                  default_flow_style=False,
                  allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
Example #19
def _write_tool(step_dir, name, inputs, outputs, parallel, programs,
                file_estimates, disk, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory(
        (programs or []) + ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {
        "class": "ResourceRequirement",
        "coresMin": cores,
        "ramMin": mem_mb_total
    }
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
    out = {
        "class": "CommandLineTool",
        "cwlVersion": "cwl:draft-3",
        "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
        "hints": [cwl_res],
        "arguments": [],
        "inputs": [],
        "outputs": []
    }
    out["arguments"].append({
        "position": 0,
        "valueFrom": "sentinel-runtime=$(runtime)"
    })
    std_inputs = [{
        "id": "#sentinel-parallel",
        "type": "string",
        "default": parallel
    }, {
        "id":
        "#sentinel-outputs",
        "type":
        "string",
        "default":
        json.dumps([workflow.get_base_id(x["id"]) for x in outputs],
                   sort_keys=True,
                   separators=(',', ':'))
    }]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {
            "prefix": "%s=" % base_id,
            "separate": False,
            "itemSeparator": ";;",
            "position": i
        }
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:

        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str',
                                               data,
                                               style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)

        yaml.add_representer(str, str_presenter)
        yaml.dump(out,
                  out_handle,
                  default_flow_style=False,
                  allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
Example #20
def _write_tool(step_dir, name, inputs, outputs, parallel, programs, file_estimates, disk, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {"class": "ResourceRequirement",
               "coresMin": cores, "ramMin": mem_mb_total, "outdirMin": bcbio_docker_disk}
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
            cwl_res["outdirMin"] += total_estimate
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "hints": [cwl_res],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
    out["arguments"].append({"position": 0, "valueFrom":
                             "sentinel-runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"})
    std_inputs = [{"id": "sentinel-parallel", "type": "string",
                   "default": parallel},
                  {"id": "sentinel-outputs", "type": "string",
                   "default": ",".join([workflow.get_base_id(x["id"]) for x in outputs])}]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {"prefix": "%s=" % base_id, "separate": False,
                       "itemSeparator": ";;", "position": i}
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:
        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)
        yaml.add_representer(str, str_presenter)
        yaml.dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
Example #21
def _write_tool(step_dir, name, inputs, outputs, parallel, programs, samples):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    cores, mem_gb_per_core = resources.cpu_and_memory(
        programs if programs else ["default"], samples)
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    out = {
        "class":
        "CommandLineTool",
        "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
        "hints": [{
            "class": "ResourceRequirement",
            "coresMin": cores,
            "ramMin": mem_mb_total
        }],
        "arguments": [],
        "inputs": [],
        "outputs": []
    }
    out["arguments"].append({
        "position": 0,
        "prefix": "sentinel-runtime=",
        "separate": False,
        "valueFrom": "$(JSON.stringify(runtime))"
    })
    std_inputs = [{
        "id": "#sentinel-parallel",
        "type": "string",
        "default": parallel
    }]
    inputs = std_inputs + inputs
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = "#%s" % base_id
        for attr in ["source", "valueFrom"]:
            inp_tool.pop(attr, None)
        inp_binding = {
            "prefix": "%s=" % base_id,
            "separate": False,
            "itemSeparator": ";;",
            "position": i
        }
        inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool["id"] = "#%s" % workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:

        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str',
                                               data,
                                               style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)

        yaml.add_representer(str, str_presenter)
        yaml.dump(out,
                  out_handle,
                  default_flow_style=False,
                  allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
Example #22
def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples, cur_remotes, no_files,
                container_tags=None):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory((programs or []) + ["default"], samples)
    cores = min([step_cores, resource_cores]) if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    cwl_res = {"class": "ResourceRequirement", "coresMin": cores, "ramMin": mem_mb_total}
    disk_hint, input_hint = _get_disk_estimates(name, parallel, inputs, file_estimates, samples, disk,
                                                cur_remotes, no_files)
    cwl_res.update(disk_hint)
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    if container_tags is not None:
        docker_image, container_tags = _add_current_quay_tag(docker_image, container_tags)
    docker = {"class": "DockerRequirement", "dockerPull": docker_image, "dockerImageId": docker_image}
    out = {"class": "CommandLineTool",
           "cwlVersion": "v1.0",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "requirements": [],
           "hints": [docker, cwl_res, input_hint],
           "arguments": [],
           "inputs": [],
           "outputs": []}
    if programs:
        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = ["https://anaconda.org/bioconda/%s" % out["package"]]
            return out
        out["hints"].append({"class": "SoftwareRequirement",
                             "packages": [resolve_package(p) for p in programs]})
        # GATK requires networking for setting up log4j logging, use arvados extension
        if any(p.startswith(("gatk", "sentieon")) for p in programs):
            out["hints"] += [{"class": "arv:APIRequirement"}]
    # Multi-process methods that read heavily from BAM files need extra keep cache for Arvados
    if name in ["pipeline_summary", "variantcall_batch_region", "detect_sv"]:
        out["hints"] += [{"class": "arv:RuntimeConstraints", "keep_cache": 4096}]
    def add_to_namespaces(k, v, out):
        if "$namespaces" not in out:
            out["$namespaces"] = {}
        out["$namespaces"][k] = v
        return out
    if any(h.get("class", "").startswith("arv:") for h in out["hints"]):
        out = add_to_namespaces("arv", "http://arvados.org/cwl#", out)
    if any(h.get("class", "").startswith("dx") for h in out["hints"]):
        out = add_to_namespaces("dx", "https://www.dnanexus.com/cwl#", out)
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{"class": "InlineJavascriptRequirement"},
                            {"class": "InitialWorkDirRequirement",
                                "listing": [{"entryname": "cwl.inputs.json",
                                            "entry": "$(JSON.stringify(inputs))"}]}]
    out["arguments"] += [{"position": 0, "valueFrom":
                          "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"},
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" % ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join(["%s:%s" %
                                                          (workflow.get_base_id(v["id"]),
                                                           "record" if workflow.is_cwl_record(v) else "var")
                                                          for v in inputs]),
                         "run_number=0"]
    out = _add_inputs_to_tool(inputs, out, parallel, use_commandline_args)
    out = _add_outputs_to_tool(outputs, out)
    _tool_to_file(out, out_file)
    return os.path.join("steps", os.path.basename(out_file))
Example #23
def _write_tool(step_dir, name, inputs, outputs, parallel, image, programs,
                file_estimates, disk, step_cores, samples, cur_remotes):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    resource_cores, mem_gb_per_core = resources.cpu_and_memory(
        (programs or []) + ["default"], samples)
    cores = min([step_cores, resource_cores]) if step_cores else resource_cores
    mem_mb_total = int(mem_gb_per_core * cores * 1024)
    bcbio_docker_disk = 1 * 1024  # Minimum requirements for bcbio Docker image
    cwl_res = {
        "class": "ResourceRequirement",
        "coresMin": cores,
        "ramMin": mem_mb_total,
        "outdirMin": bcbio_docker_disk
    }
    cwl_res = _add_disk_estimates(cwl_res, inputs, file_estimates, disk)
    docker_image = "bcbio/bcbio" if image == "bcbio" else "quay.io/bcbio/%s" % image
    docker = {
        "class": "DockerRequirement",
        "dockerPull": docker_image,
        "dockerImageId": docker_image
    }
    out = {
        "class": "CommandLineTool",
        "cwlVersion": "v1.0",
        "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
        "requirements": [],
        "hints": [docker, cwl_res],
        "arguments": [],
        "inputs": [],
        "outputs": []
    }
    if programs:

        def resolve_package(p):
            out = {}
            parts = p.split("=")
            if len(parts) == 2:
                out["package"] = parts[0]
                out["version"] = [parts[1]]
            else:
                out["package"] = p
            out["specs"] = [
                "https://anaconda.org/bioconda/%s" % out["package"]
            ]
            return out

        out["hints"].append({
            "class": "SoftwareRequirement",
            "packages": [resolve_package(p) for p in programs]
        })
        # GATK requires networking for setting up log4j logging, use arvados extension
        if any(p.startswith(("gatk", "sentieon")) for p in programs):
            out["hints"] += [{"class": "arv:APIRequirement"}]
    # Multi-process methods that read heavily from BAM files need extra keep cache for Arvados
    if name in ["pipeline_summary", "variantcall_batch_region"]:
        out["hints"] += [{
            "class": "arv:RuntimeConstraints",
            "keep_cache": 4096
        }]
    if any(h.get("class", "").startswith("arv:") for h in out["hints"]):
        out["$namespaces"] = {"arv": "http://arvados.org/cwl#"}
    # Use JSON for inputs, rather than command line arguments
    # Correctly handles multiple values and batching across CWL runners
    use_commandline_args = False
    out["requirements"] += [{
        "class": "InlineJavascriptRequirement"
    }, {
        "class":
        "InitialWorkDirRequirement",
        "listing": [{
            "entryname": "cwl.inputs.json",
            "entry": "$(JSON.stringify(inputs))"
        }]
    }]
    out["arguments"] += [{
        "position":
        0,
        "valueFrom":
        "sentinel_runtime=cores,$(runtime['cores']),ram,$(runtime['ram'])"
    },
                         "sentinel_parallel=%s" % parallel,
                         "sentinel_outputs=%s" %
                         ",".join([_get_sentinel_val(v) for v in outputs]),
                         "sentinel_inputs=%s" % ",".join([
                             "%s:%s" %
                             (workflow.get_base_id(v["id"]),
                              "record" if workflow.is_cwl_record(v) else "var")
                             for v in inputs
                         ])]
    for i, inp in enumerate(inputs):
        base_id = workflow.get_base_id(inp["id"])
        inp_tool = copy.deepcopy(inp)
        inp_tool["id"] = base_id
        if inp.get("wf_duplicate"):
            inp_tool["id"] += "_toolinput"
        for attr in ["source", "valueFrom", "wf_duplicate"]:
            inp_tool.pop(attr, None)
        # Ensure records and workflow inputs get scattered
        if (_is_scatter_parallel(parallel) and _do_scatter_var(inp, parallel)
                and (workflow.is_cwl_record(inp) or inp.get("wf_duplicate"))):
            inp_tool = workflow._flatten_nested_input(inp_tool)
        if use_commandline_args:
            inp_binding = {
                "prefix": "%s=" % base_id,
                "separate": False,
                "itemSeparator": ";;",
                "position": i
            }
            inp_tool = _place_input_binding(inp_tool, inp_binding, parallel)
        else:
            inp_binding = None
        inp_tool = _place_secondary_files(inp_tool, inp_binding)
        inp_tool = _clean_record(inp_tool)
        out["inputs"].append(inp_tool)
    for outp in outputs:
        outp_tool = copy.deepcopy(outp)
        outp_tool = _clean_record(outp_tool)
        outp_tool["id"] = workflow.get_base_id(outp["id"])
        out["outputs"].append(outp_tool)
    with open(out_file, "w") as out_handle:

        def str_presenter(dumper, data):
            if len(data.splitlines()) > 1:  # check for multiline string
                return dumper.represent_scalar('tag:yaml.org,2002:str',
                                               data,
                                               style='|')
            return dumper.represent_scalar('tag:yaml.org,2002:str', data)

        yaml.add_representer(str, str_presenter)
        yaml.dump(out,
                  out_handle,
                  default_flow_style=False,
                  allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))
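This final revision computes disk estimates through _add_disk_estimates, which is likewise not shown in the listing. A hedged reconstruction based on the inline estimate handling in the earlier examples (the name matches the call above; the body is inferred):

def _add_disk_estimates(cwl_res, inputs, file_estimates, disk):
    # Inferred from the inline logic of earlier _write_tool revisions: scale the
    # per-key file size estimates by the configured disk multipliers and add the
    # total to the temporary and output directory minimums.
    # `inputs` is accepted for signature compatibility but unused in this sketch.
    if file_estimates and disk:
        total_estimate = 0
        for key, multiplier in disk.items():
            if key in file_estimates:
                total_estimate += int(multiplier * file_estimates[key])
        if total_estimate:
            cwl_res["tmpdirMin"] = total_estimate
            cwl_res["outdirMin"] = cwl_res.get("outdirMin", 0) + total_estimate
    return cwl_res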