Ejemplo n.º 1
0
    def collect_output(self, schema, builder, outdir):
        r = []
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []

            #revmap = partial(revmap_file, builder, outdir)

            if "glob" in binding:
                for gb in aslist(binding["glob"]):
                    gb = builder.do_eval(gb)
                    #_logger.debug("gb evaluated to: '{}'".format(gb))
                    if gb:
                        globpatterns.extend(aslist(gb))

                for gb in globpatterns:
                    g = os.path.join(outdir, gb)
                    cls = "File" if schema["type"] == "File" else "Directory"

                    r.extend([
                        {"location": g,
                         "path": g,
                         "basename": os.path.basename(g),
                         "nameroot": os.path.splitext(os.path.basename(g))[0],
                         "nameext": os.path.splitext(os.path.basename(g))[1],
                         "class": cls}

                    ])
        optional = False
        single = False
        if isinstance(schema["type"], list):
            if "null" in schema["type"]:
                optional = True
            if "File" in schema["type"] or "Directory" in schema["type"]:
                single = True
        elif schema["type"] == "File" or schema["type"] == "Directory":
            single = True
        if "outputEval" in binding:
            r = builder.do_eval(binding["outputEval"], context=r)

        if single:
            if not r and not optional:
                raise WorkflowException("Did not find output file with glob pattern: '{}'".format(globpatterns))
            elif not r and optional:
                pass
            elif isinstance(r,list):
                if len(r) > 1:
                    raise WorkflowException("Multiple matches for output item that is a single file")
                else:
                    r = r[0]

        return r
Ejemplo n.º 2
0
def simplify_list(maybe_list):
    """Turn a length one list loaded by cwltool into a scalar.
    Anything else is passed as-is, by reference."""
    if isinstance(maybe_list, MutableSequence):
        is_list = aslist(maybe_list)
        if len(is_list) == 1:
            return is_list[0]
    return maybe_list
Ejemplo n.º 3
0
def simplify_list(l):
    """Turn a length one list loaded by cwltool into a scalar.
    Anything else is passed as-is, by reference."""
    if isinstance(l, CommentedSeq):
        l = aslist(l)
        if len(l) == 1:
            return l[0]
    return l
Ejemplo n.º 4
0
def upload_job_order(arvrunner, name, tool, job_order):
    """Upload local files referenced in the input object and return updated input
    object with 'location' updated to the proper keep references.
    """

    # Make a copy of the job order and set defaults.
    builder_job_order = copy.copy(job_order)

    # fill_in_defaults throws an error if there are any
    # missing required parameters, we don't want it to do that
    # so make them all optional.
    inputs_copy = copy.deepcopy(tool.tool["inputs"])
    for i in inputs_copy:
        if "null" not in i["type"]:
            i["type"] = ["null"] + aslist(i["type"])

    fill_in_defaults(inputs_copy,
                     builder_job_order,
                     arvrunner.fs_access)
    # Need to create a builder object to evaluate expressions.
    builder = make_builder(builder_job_order,
                           tool.hints,
                           tool.requirements,
                           ArvRuntimeContext(),
                           tool.metadata)
    # Now update job_order with secondaryFiles
    discover_secondary_files(arvrunner.fs_access,
                             builder,
                             tool.tool["inputs"],
                             job_order)

    jobmapper = upload_dependencies(arvrunner,
                                    name,
                                    tool.doc_loader,
                                    job_order,
                                    job_order.get("id", "#"),
                                    False)

    if "id" in job_order:
        del job_order["id"]

    # Need to filter this out, gets added by cwltool when providing
    # parameters on the command line.
    if "job_order" in job_order:
        del job_order["job_order"]

    return job_order
Ejemplo n.º 5
0
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed",
            "properties": {},
        }
        runtime_constraints = {}

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        mounts = {
            self.outdir: {
                "kind": "tmp",
                "capacity": resources.get("outdirSize", 0) * 2**20
            },
            self.tmpdir: {
                "kind": "tmp",
                "capacity": resources.get("tmpdirSize", 0) * 2**20
            }
        }
        scheduling_parameters = {}

        rf = [
            self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files
        ]
        rf.sort(key=lambda k: k.resolved)
        prevdir = None
        for resolved, target, tp, stg in rf:
            if not stg:
                continue
            if prevdir and target.startswith(prevdir):
                continue
            if tp == "Directory":
                targetdir = target
            else:
                targetdir = os.path.dirname(target)
            sp = resolved.split("/", 1)
            pdh = sp[0][5:]  # remove "keep:"
            mounts[targetdir] = {
                "kind": "collection",
                "portable_data_hash": pdh
            }
            if len(sp) == 2:
                if tp == "Directory":
                    path = sp[1]
                else:
                    path = os.path.dirname(sp[1])
                if path and path != "/":
                    mounts[targetdir]["path"] = path
            prevdir = targetdir + "/"

        with Perf(metrics, "generatefiles %s" % self.name):
            if self.generatefiles["listing"]:
                vwd = arvados.collection.Collection(
                    api_client=self.arvrunner.api,
                    keep_client=self.arvrunner.keep_client,
                    num_retries=self.arvrunner.num_retries)
                generatemapper = NoFollowPathMapper([self.generatefiles],
                                                    "",
                                                    "",
                                                    separateDirs=False)

                with Perf(metrics, "createfiles %s" % self.name):
                    for f, p in generatemapper.items():
                        if not p.target:
                            pass
                        elif p.type in ("File", "Directory"):
                            source, path = self.arvrunner.fs_access.get_collection(
                                p.resolved)
                            vwd.copy(path, p.target, source_collection=source)
                        elif p.type == "CreateFile":
                            with vwd.open(p.target, "w") as n:
                                n.write(p.resolved.encode("utf-8"))

                with Perf(metrics, "generatefiles.save_new %s" % self.name):
                    vwd.save_new()

                for f, p in generatemapper.items():
                    if not p.target:
                        continue
                    mountpoint = "%s/%s" % (self.outdir, p.target)
                    mounts[mountpoint] = {
                        "kind": "collection",
                        "portable_data_hash": vwd.portable_data_hash(),
                        "path": p.target
                    }

        container_request["environment"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            sp = self.stdin[6:].split("/", 1)
            mounts["stdin"] = {
                "kind": "collection",
                "portable_data_hash": sp[0],
                "path": sp[1]
            }

        if self.stderr:
            mounts["stderr"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stderr)
            }

        if self.stdout:
            mounts["stdout"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stdout)
            }

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs"}

        container_request["container_image"] = arv_docker_get_image(
            self.arvrunner.api, docker_req, pull_image,
            self.arvrunner.project_uuid)

        api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = get_feature(
            self, "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints[
                    "keep_cache_ram"] = runtime_req["keep_cache"] * 2**20
            if "outputDirType" in runtime_req:
                if runtime_req["outputDirType"] == "local_output_dir":
                    # Currently the default behavior.
                    pass
                elif runtime_req["outputDirType"] == "keep_output_dir":
                    mounts[self.outdir] = {
                        "kind": "collection",
                        "writable": True
                    }

        partition_req, _ = get_feature(
            self, "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(
                partition_req["partition"])

        intermediate_output_req, _ = get_feature(
            self, "http://arvados.org/cwl#IntermediateOutput")
        if intermediate_output_req:
            self.output_ttl = intermediate_output_req["outputTTL"]
        else:
            self.output_ttl = self.arvrunner.intermediate_output_ttl

        if self.output_ttl < 0:
            raise WorkflowError(
                "Invalid value %d for output_ttl, cannot be less than zero" %
                container_request["output_ttl"])

        container_request["output_ttl"] = self.output_ttl
        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["use_existing"] = kwargs.get("enable_reuse", True)
        container_request["scheduling_parameters"] = scheduling_parameters

        if kwargs.get("runnerjob", "").startswith("arvwf:"):
            wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(
                num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request).execute(
                    num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.processes[self.uuid] = self

            if response["state"] == "Final":
                logger.info("%s reused container %s",
                            self.arvrunner.label(self),
                            response["container_uuid"])
                self.done(response)
            else:
                logger.info("%s %s state is %s", self.arvrunner.label(self),
                            response["uuid"], response["state"])
        except Exception as e:
            logger.error("%s got error %s" %
                         (self.arvrunner.label(self), str(e)))
            self.output_callback({}, "permanentFail")
Ejemplo n.º 6
0
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)

        # `promises` dict
        # from: each parameter (workflow input or step output)
        #   that may be used as a "source" for a step input workflow output
        #   parameter
        # to: the job that will produce that value.
        promises = {}

        # `jobs` dict from step id to job that implements that step.
        jobs = {}

        for inp in self.cwlwf.tool["inputs"]:
            promises[inp["id"]] = SelfJob(self, cwljob)

        alloutputs_fufilled = False
        while not alloutputs_fufilled:
            # Iteratively go over the workflow steps, scheduling jobs as their
            # dependencies can be fufilled by upstream workflow inputs or
            # step outputs.  Loop exits when the workflow outputs
            # are satisfied.

            alloutputs_fufilled = True

            for step in self.cwlwf.steps:
                if step.tool["id"] not in jobs:
                    stepinputs_fufilled = True
                    for inp in step.tool["inputs"]:
                        if "source" in inp:
                            for s in aslist(inp["source"]):
                                if s not in promises:
                                    stepinputs_fufilled = False
                    if stepinputs_fufilled:
                        jobobj = {}

                        for inp in step.tool["inputs"]:
                            key = shortname(inp["id"])
                            if "source" in inp:
                                if inp.get("linkMerge") or len(aslist(inp["source"])) > 1:
                                    linkMerge = inp.get("linkMerge", "merge_nested")
                                    if linkMerge == "merge_nested":
                                        jobobj[key] = (
                                            MergeInputsNested([(shortname(s), promises[s].rv())
                                                               for s in aslist(inp["source"])]))
                                    elif linkMerge == "merge_flattened":
                                        jobobj[key] = (
                                            MergeInputsFlattened([(shortname(s), promises[s].rv())
                                                                  for s in aslist(inp["source"])]))
                                    else:
                                        raise validate.ValidationException(
                                            "Unsupported linkMerge '%s'", linkMerge)
                                else:
                                    jobobj[key] = (shortname(inp["source"]),
                                                   promises[inp["source"]].rv())
                            elif "default" in inp:
                                d = copy.copy(inp["default"])
                                jobobj[key] = ("default", {"default": d})

                            if "valueFrom" in inp and "scatter" not in step.tool:
                                if key in jobobj:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                jobobj[key],
                                                                self.cwlwf.requirements)
                                else:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                ("None", {"None": None}),
                                                                self.cwlwf.requirements)

                        if "scatter" in step.tool:
                            wfjob = CWLScatter(step, IndirectDict(jobobj), **self.executor_options)
                            followOn = CWLGather(step, wfjob.rv())
                            wfjob.addFollowOn(followOn)
                        else:
                            (wfjob, followOn) = makeJob(step.embedded_tool, IndirectDict(jobobj),
                                                        step_inputs=step.tool["inputs"],
                                                        **self.executor_options)

                        jobs[step.tool["id"]] = followOn

                        connected = False
                        for inp in step.tool["inputs"]:
                            for s in aslist(inp.get("source", [])):
                                if not promises[s].hasChild(wfjob):
                                    promises[s].addChild(wfjob)
                                    connected = True
                        if not connected:
                            # workflow step has default inputs only, isn't connected to other jobs,
                            # so add it as child of workflow.
                            self.addChild(wfjob)

                        for out in step.tool["outputs"]:
                            promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for s in aslist(inp.get("source", [])):
                        if s not in promises:
                            alloutputs_fufilled = False

            # may need a test
            for out in self.cwlwf.tool["outputs"]:
                if "source" in out:
                    if out["source"] not in promises:
                        alloutputs_fufilled = False

        outobj = {}
        for out in self.cwlwf.tool["outputs"]:
            outobj[shortname(out["id"])] = (shortname(out["outputSource"]), promises[out["outputSource"]].rv())

        return IndirectDict(outobj)
Ejemplo n.º 7
0
    def run(self, runtimeContext):
        # ArvadosCommandTool subclasses from cwltool.CommandLineTool,
        # which calls makeJobRunner() to get a new ArvadosContainer
        # object.  The fields that define execution such as
        # command_line, environment, etc are set on the
        # ArvadosContainer object by CommandLineTool.job() before
        # run() is called.

        runtimeContext = self.job_runtime

        container_request = {
            "command": self.command_line,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": runtimeContext.priority,
            "state": "Committed",
            "properties": {},
        }
        runtime_constraints = {}

        if runtimeContext.project_uuid:
            container_request["owner_uuid"] = runtimeContext.project_uuid

        if self.arvrunner.secret_store.has_secret(self.command_line):
            raise WorkflowException(
                "Secret material leaked on command line, only file literals may contain secrets"
            )

        if self.arvrunner.secret_store.has_secret(self.environment):
            raise WorkflowException(
                "Secret material leaked in environment, only file literals may contain secrets"
            )

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = math.ceil(resources.get("cores", 1))
            runtime_constraints["ram"] = math.ceil(
                resources.get("ram") * 2**20)

        mounts = {
            self.outdir: {
                "kind": "tmp",
                "capacity": math.ceil(resources.get("outdirSize", 0) * 2**20)
            },
            self.tmpdir: {
                "kind": "tmp",
                "capacity": math.ceil(resources.get("tmpdirSize", 0) * 2**20)
            }
        }
        secret_mounts = {}
        scheduling_parameters = {}

        rf = [
            self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files
        ]
        rf.sort(key=lambda k: k.resolved)
        prevdir = None
        for resolved, target, tp, stg in rf:
            if not stg:
                continue
            if prevdir and target.startswith(prevdir):
                continue
            if tp == "Directory":
                targetdir = target
            else:
                targetdir = os.path.dirname(target)
            sp = resolved.split("/", 1)
            pdh = sp[0][5:]  # remove "keep:"
            mounts[targetdir] = {
                "kind": "collection",
                "portable_data_hash": pdh
            }
            if pdh in self.pathmapper.pdh_to_uuid:
                mounts[targetdir]["uuid"] = self.pathmapper.pdh_to_uuid[pdh]
            if len(sp) == 2:
                if tp == "Directory":
                    path = sp[1]
                else:
                    path = os.path.dirname(sp[1])
                if path and path != "/":
                    mounts[targetdir]["path"] = path
            prevdir = targetdir + "/"

        with Perf(metrics, "generatefiles %s" % self.name):
            if self.generatefiles["listing"]:
                vwd = arvados.collection.Collection(
                    api_client=self.arvrunner.api,
                    keep_client=self.arvrunner.keep_client,
                    num_retries=self.arvrunner.num_retries)
                generatemapper = NoFollowPathMapper(
                    self.generatefiles["listing"], "", "", separateDirs=False)

                sorteditems = sorted(generatemapper.items(),
                                     key=lambda n: n[1].target)

                logger.debug("generatemapper is %s", sorteditems)

                with Perf(metrics, "createfiles %s" % self.name):
                    for f, p in sorteditems:
                        if not p.target:
                            continue

                        if p.target.startswith("/"):
                            dst = p.target[len(self.outdir) +
                                           1:] if p.target.startswith(
                                               self.outdir +
                                               "/") else p.target[1:]
                        else:
                            dst = p.target

                        if p.type in ("File", "Directory", "WritableFile",
                                      "WritableDirectory"):
                            if p.resolved.startswith("_:"):
                                vwd.mkdirs(dst)
                            else:
                                source, path = self.arvrunner.fs_access.get_collection(
                                    p.resolved)
                                vwd.copy(path or ".",
                                         dst,
                                         source_collection=source)
                        elif p.type == "CreateFile":
                            if self.arvrunner.secret_store.has_secret(
                                    p.resolved):
                                mountpoint = p.target if p.target.startswith(
                                    "/") else os.path.join(
                                        self.outdir, p.target)
                                secret_mounts[mountpoint] = {
                                    "kind":
                                    "text",
                                    "content":
                                    self.arvrunner.secret_store.retrieve(
                                        p.resolved)
                                }
                            else:
                                with vwd.open(dst, "w") as n:
                                    n.write(p.resolved)

                def keepemptydirs(p):
                    if isinstance(p, arvados.collection.RichCollectionBase):
                        if len(p) == 0:
                            p.open(".keep", "w").close()
                        else:
                            for c in p:
                                keepemptydirs(p[c])

                keepemptydirs(vwd)

                if not runtimeContext.current_container:
                    runtimeContext.current_container = arvados_cwl.util.get_current_container(
                        self.arvrunner.api, self.arvrunner.num_retries, logger)
                info = arvados_cwl.util.get_intermediate_collection_info(
                    self.name, runtimeContext.current_container,
                    runtimeContext.intermediate_output_ttl)
                vwd.save_new(name=info["name"],
                             owner_uuid=runtimeContext.project_uuid,
                             ensure_unique_name=True,
                             trash_at=info["trash_at"],
                             properties=info["properties"])

                prev = None
                for f, p in sorteditems:
                    if (not p.target or self.arvrunner.secret_store.has_secret(
                            p.resolved) or
                        (prev is not None and p.target.startswith(prev))):
                        continue
                    if p.target.startswith("/"):
                        dst = p.target[len(self.outdir) +
                                       1:] if p.target.startswith(
                                           self.outdir + "/") else p.target[1:]
                    else:
                        dst = p.target
                    mountpoint = p.target if p.target.startswith(
                        "/") else os.path.join(self.outdir, p.target)
                    mounts[mountpoint] = {
                        "kind": "collection",
                        "portable_data_hash": vwd.portable_data_hash(),
                        "path": dst
                    }
                    if p.type.startswith("Writable"):
                        mounts[mountpoint]["writable"] = True
                    prev = p.target + "/"

        container_request["environment"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            sp = self.stdin[6:].split("/", 1)
            mounts["stdin"] = {
                "kind": "collection",
                "portable_data_hash": sp[0],
                "path": sp[1]
            }

        if self.stderr:
            mounts["stderr"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stderr)
            }

        if self.stdout:
            mounts["stdout"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stdout)
            }

        (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs:" + __version__}

        container_request["container_image"] = arv_docker_get_image(
            self.arvrunner.api, docker_req, runtimeContext.pull_image,
            runtimeContext.project_uuid, runtimeContext.force_docker_pull,
            runtimeContext.tmp_outdir_prefix)

        network_req, _ = self.get_requirement("NetworkAccess")
        if network_req:
            runtime_constraints["API"] = network_req["networkAccess"]

        api_req, _ = self.get_requirement(
            "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = self.get_requirement(
            "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints["keep_cache_ram"] = math.ceil(
                    runtime_req["keep_cache"] * 2**20)
            if "outputDirType" in runtime_req:
                if runtime_req["outputDirType"] == "local_output_dir":
                    # Currently the default behavior.
                    pass
                elif runtime_req["outputDirType"] == "keep_output_dir":
                    mounts[self.outdir] = {
                        "kind": "collection",
                        "writable": True
                    }

        partition_req, _ = self.get_requirement(
            "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(
                partition_req["partition"])

        intermediate_output_req, _ = self.get_requirement(
            "http://arvados.org/cwl#IntermediateOutput")
        if intermediate_output_req:
            self.output_ttl = intermediate_output_req["outputTTL"]
        else:
            self.output_ttl = self.arvrunner.intermediate_output_ttl

        if self.output_ttl < 0:
            raise WorkflowException(
                "Invalid value %d for output_ttl, cannot be less than zero" %
                container_request["output_ttl"])

        if self.timelimit is not None and self.timelimit > 0:
            scheduling_parameters["max_run_time"] = self.timelimit

        extra_submit_params = {}
        if runtimeContext.submit_runner_cluster:
            extra_submit_params[
                "cluster_id"] = runtimeContext.submit_runner_cluster

        container_request["output_name"] = "Output for step %s" % (self.name)
        container_request["output_ttl"] = self.output_ttl
        container_request["mounts"] = mounts
        container_request["secret_mounts"] = secret_mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["scheduling_parameters"] = scheduling_parameters

        enable_reuse = runtimeContext.enable_reuse
        if enable_reuse:
            reuse_req, _ = self.get_requirement("WorkReuse")
            if reuse_req:
                enable_reuse = reuse_req["enableReuse"]
            reuse_req, _ = self.get_requirement(
                "http://arvados.org/cwl#ReuseRequirement")
            if reuse_req:
                enable_reuse = reuse_req["enableReuse"]
        container_request["use_existing"] = enable_reuse

        if runtimeContext.runnerjob.startswith("arvwf:"):
            wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.
                                              index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(
                num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        self.output_callback = self.arvrunner.get_wrapped_callback(
            self.output_callback)

        try:
            if runtimeContext.submit_request_uuid:
                response = self.arvrunner.api.container_requests().update(
                    uuid=runtimeContext.submit_request_uuid,
                    body=container_request,
                    **extra_submit_params).execute(
                        num_retries=self.arvrunner.num_retries)
            else:
                response = self.arvrunner.api.container_requests().create(
                    body=container_request, **extra_submit_params).execute(
                        num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.process_submitted(self)

            if response["state"] == "Final":
                logger.info("%s reused container %s",
                            self.arvrunner.label(self),
                            response["container_uuid"])
            else:
                logger.info("%s %s state is %s", self.arvrunner.label(self),
                            response["uuid"], response["state"])
        except Exception as e:
            logger.exception("%s error submitting container\n%s",
                             self.arvrunner.label(self), e)
            logger.debug("Container request was %s", container_request)
            self.output_callback({}, "permanentFail")
Ejemplo n.º 8
0
    def run(self, file_store):
        cwljob = resolve_indirect(self.cwljob)

        # `promises` dict
        # from: each parameter (workflow input or step output)
        #   that may be used as a "source" for a step input workflow output
        #   parameter
        # to: the job that will produce that value.
        promises = {}

        # `jobs` dict from step id to job that implements that step.
        jobs = {}

        for inp in self.cwlwf.tool["inputs"]:
            promises[inp["id"]] = SelfJob(self, cwljob)

        alloutputs_fufilled = False
        while not alloutputs_fufilled:
            # Iteratively go over the workflow steps, scheduling jobs as their
            # dependencies can be fufilled by upstream workflow inputs or
            # step outputs.  Loop exits when the workflow outputs
            # are satisfied.

            alloutputs_fufilled = True

            for step in self.cwlwf.steps:
                if step.tool["id"] not in jobs:
                    stepinputs_fufilled = True
                    for inp in step.tool["inputs"]:
                        if "source" in inp:
                            for s in aslist(inp["source"]):
                                if s not in promises:
                                    stepinputs_fufilled = False
                    if stepinputs_fufilled:
                        jobobj = {}

                        for inp in step.tool["inputs"]:
                            key = shortname(inp["id"])
                            if "source" in inp:
                                if inp.get("linkMerge") \
                                        or len(aslist(inp["source"])) > 1:
                                    linkMerge = inp.get(
                                        "linkMerge", "merge_nested")
                                    if linkMerge == "merge_nested":
                                        jobobj[key] = (
                                            MergeInputsNested(
                                                [(shortname(s),
                                                  promises[s].rv())
                                                 for s in aslist(
                                                     inp["source"])]))
                                    elif linkMerge == "merge_flattened":
                                        jobobj[key] = (
                                            MergeInputsFlattened(
                                                [(shortname(s),
                                                  promises[s].rv())
                                                 for s in aslist(
                                                      inp["source"])]))
                                    else:
                                        raise validate.ValidationException(
                                            "Unsupported linkMerge '%s'" %
                                            linkMerge)
                                else:
                                    inpSource = inp["source"]
                                    if isinstance(inpSource, MutableSequence):
                                        # It seems that an input source with a
                                        # '#' in the name will be returned as a
                                        # CommentedSeq list by the yaml parser.
                                        inpSource = str(inpSource[0])
                                    jobobj[key] = (shortname(inpSource),
                                                   promises[inpSource].rv())
                            if "default" in inp:
                                if key in jobobj:
                                    if isinstance(jobobj[key][1], Promise):
                                        d = copy.copy(inp["default"])
                                        jobobj[key] = DefaultWithSource(
                                            d, jobobj[key])
                                    else:
                                        if jobobj[key][1][
                                                jobobj[key][0]] is None:
                                            d = copy.copy(inp["default"])
                                            jobobj[key] = (
                                                "default", {"default": d})
                                else:
                                    d = copy.copy(inp["default"])
                                    jobobj[key] = ("default", {"default": d})

                            if "valueFrom" in inp \
                                    and "scatter" not in step.tool:
                                if key in jobobj:
                                    jobobj[key] = StepValueFrom(
                                        inp["valueFrom"], jobobj[key],
                                        self.cwlwf.requirements)
                                else:
                                    jobobj[key] = StepValueFrom(
                                        inp["valueFrom"], (
                                            "None", {"None": None}),
                                        self.cwlwf.requirements)

                        if "scatter" in step.tool:
                            wfjob = CWLScatter(step, IndirectDict(jobobj),
                                               self.runtime_context)
                            followOn = CWLGather(step, wfjob.rv())
                            wfjob.addFollowOn(followOn)
                        else:
                            (wfjob, followOn) = makeJob(
                                step.embedded_tool, IndirectDict(jobobj),
                                step.tool["inputs"],
                                self.runtime_context)

                        jobs[step.tool["id"]] = followOn

                        connected = False
                        for inp in step.tool["inputs"]:
                            for s in aslist(inp.get("source", [])):
                                if (isinstance(
                                        promises[s], (CWLJobWrapper, CWLGather)
                                              ) and
                                        not promises[s].hasFollowOn(wfjob)):
                                    promises[s].addFollowOn(wfjob)
                                    connected = True
                                if (not isinstance(
                                        promises[s], (CWLJobWrapper, CWLGather)
                                                  ) and
                                        not promises[s].hasChild(wfjob)):
                                    promises[s].addChild(wfjob)
                                    connected = True
                        if not connected:
                            # the workflow step has default inputs only & isn't
                            # connected to other jobs, so add it as child of
                            # this workflow.
                            self.addChild(wfjob)

                        for out in step.tool["outputs"]:
                            promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for source in aslist(inp.get("source", [])):
                        if source not in promises:
                            alloutputs_fufilled = False

            # may need a test
            for out in self.cwlwf.tool["outputs"]:
                if "source" in out:
                    if out["source"] not in promises:
                        alloutputs_fufilled = False

        outobj = {}
        for out in self.cwlwf.tool["outputs"]:
            key = shortname(out["id"])
            if out.get("linkMerge") or len(aslist(out["outputSource"])) > 1:
                link_merge = out.get("linkMerge", "merge_nested")
                if link_merge == "merge_nested":
                    outobj[key] = (
                        MergeInputsNested(
                            [(shortname(s), promises[s].rv())
                             for s in aslist(out["outputSource"])]))
                elif link_merge == "merge_flattened":
                    outobj[key] = (
                        MergeInputsFlattened([
                            (shortname(s), promises[s].rv())
                            for s in aslist(out["source"])]))
                else:
                    raise validate.ValidationException(
                        "Unsupported linkMerge '{}'".format(link_merge))

            else:
                # A CommentedSeq of length one still appears here rarely -
                # not clear why from the CWL code. When it does, it breaks
                # the execution by causing a non-hashable type exception.
                # We simplify the list into its first (and only) element.
                src = simplify_list(out["outputSource"])
                outobj[key] = (shortname(src), promises[src].rv())

        return IndirectDict(outobj)
Ejemplo n.º 9
0
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed",
            "properties": {}
        }
        runtime_constraints = {}

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        mounts = {
            self.outdir: {
                "kind": "tmp",
                "capacity": resources.get("outdirSize", 0) * 2**20
            },
            self.tmpdir: {
                "kind": "tmp",
                "capacity": resources.get("tmpdirSize", 0) * 2**20
            }
        }
        scheduling_parameters = {}

        rf = [self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files]
        rf.sort(key=lambda k: k.resolved)
        prevdir = None
        for resolved, target, tp, stg in rf:
            if not stg:
                continue
            if prevdir and target.startswith(prevdir):
                continue
            if tp == "Directory":
                targetdir = target
            else:
                targetdir = os.path.dirname(target)
            sp = resolved.split("/", 1)
            pdh = sp[0][5:]   # remove "keep:"
            mounts[targetdir] = {
                "kind": "collection",
                "portable_data_hash": pdh
            }
            if len(sp) == 2:
                if tp == "Directory":
                    path = sp[1]
                else:
                    path = os.path.dirname(sp[1])
                if path and path != "/":
                    mounts[targetdir]["path"] = path
            prevdir = targetdir + "/"

        with Perf(metrics, "generatefiles %s" % self.name):
            if self.generatefiles["listing"]:
                vwd = arvados.collection.Collection(api_client=self.arvrunner.api,
                                                    keep_client=self.arvrunner.keep_client,
                                                    num_retries=self.arvrunner.num_retries)
                generatemapper = NoFollowPathMapper([self.generatefiles], "", "",
                                                    separateDirs=False)

                with Perf(metrics, "createfiles %s" % self.name):
                    for f, p in generatemapper.items():
                        if not p.target:
                            pass
                        elif p.type in ("File", "Directory"):
                            source, path = self.arvrunner.fs_access.get_collection(p.resolved)
                            vwd.copy(path, p.target, source_collection=source)
                        elif p.type == "CreateFile":
                            with vwd.open(p.target, "w") as n:
                                n.write(p.resolved.encode("utf-8"))

                with Perf(metrics, "generatefiles.save_new %s" % self.name):
                    vwd.save_new()

                for f, p in generatemapper.items():
                    if not p.target:
                        continue
                    mountpoint = "%s/%s" % (self.outdir, p.target)
                    mounts[mountpoint] = {"kind": "collection",
                                          "portable_data_hash": vwd.portable_data_hash(),
                                          "path": p.target}

        container_request["environment"] = {"TMPDIR": self.tmpdir, "HOME": self.outdir}
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            sp = self.stdin[6:].split("/", 1)
            mounts["stdin"] = {"kind": "collection",
                                "portable_data_hash": sp[0],
                                "path": sp[1]}

        if self.stderr:
            mounts["stderr"] = {"kind": "file",
                                "path": "%s/%s" % (self.outdir, self.stderr)}

        if self.stdout:
            mounts["stdout"] = {"kind": "file",
                                "path": "%s/%s" % (self.outdir, self.stdout)}

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs"}

        container_request["container_image"] = arv_docker_get_image(self.arvrunner.api,
                                                                     docker_req,
                                                                     pull_image,
                                                                     self.arvrunner.project_uuid)

        api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = get_feature(self, "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints["keep_cache_ram"] = runtime_req["keep_cache"] * 2**20
            if "outputDirType" in runtime_req:
                if runtime_req["outputDirType"] == "local_output_dir":
                    # Currently the default behavior.
                    pass
                elif runtime_req["outputDirType"] == "keep_output_dir":
                    mounts[self.outdir]= {
                        "kind": "collection",
                        "writable": True
                    }

        partition_req, _ = get_feature(self, "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(partition_req["partition"])

        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["use_existing"] = kwargs.get("enable_reuse", True)
        container_request["scheduling_parameters"] = scheduling_parameters

        if kwargs.get("runnerjob", "").startswith("arvwf:"):
            wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request
            ).execute(num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.processes[self.uuid] = self

            if response["state"] == "Final":
                logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"])
                self.done(response)
            else:
                logger.info("%s %s state is %s", self.arvrunner.label(self), response["uuid"], response["state"])
        except Exception as e:
            logger.error("%s got error %s" % (self.arvrunner.label(self), str(e)))
            self.output_callback({}, "permanentFail")
Ejemplo n.º 10
0
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed",
            "properties": {}
        }
        runtime_constraints = {}
        mounts = {self.outdir: {"kind": "tmp"}}
        scheduling_parameters = {}

        dirs = set()
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if tp == "Directory" and '/' not in p[6:]:
                mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]}
                dirs.add(p[6:])
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if p[6:].split("/")[0] not in dirs:
                mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]}

        if self.generatefiles["listing"]:
            raise UnsupportedRequirement(
                "InitialWorkDirRequirement not supported with --api=containers"
            )

        container_request["environment"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            raise UnsupportedRequirement(
                "Stdin redirection currently not suppported")

        if self.stderr:
            raise UnsupportedRequirement(
                "Stderr redirection currently not suppported")

        if self.stdout:
            mounts["stdout"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stdout)
            }

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": arvados_jobs_image(self.arvrunner)}

        container_request["container_image"] = arv_docker_get_image(
            self.arvrunner.api, docker_req, pull_image,
            self.arvrunner.project_uuid)

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = get_feature(
            self, "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints["keep_cache_ram"] = runtime_req[
                    "keep_cache"]

        partition_req, _ = get_feature(
            self, "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(
                partition_req["partition"])

        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["use_existing"] = kwargs.get("enable_reuse", True)
        container_request["scheduling_parameters"] = scheduling_parameters

        if kwargs.get("runnerjob", "").startswith("arvwf:"):
            wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(
                num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request).execute(
                    num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.processes[self.uuid] = self

            logger.info("Container request %s (%s) state is %s", self.name,
                        response["uuid"], response["state"])

            if response["state"] == "Final":
                self.done(response)
        except Exception as e:
            logger.error("Got error %s" % str(e))
            self.output_callback({}, "permanentFail")
Ejemplo n.º 11
0
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed",
            "properties": {}
        }
        runtime_constraints = {}
        mounts = {self.outdir: {"kind": "tmp"}}
        scheduling_parameters = {}

        dirs = set()
        for f in self.pathmapper.files():
            pdh, p, tp = self.pathmapper.mapper(f)
            if tp == "Directory" and '/' not in pdh:
                mounts[p] = {
                    "kind": "collection",
                    "portable_data_hash": pdh[5:]
                }
                dirs.add(pdh)

        for f in self.pathmapper.files():
            res, p, tp = self.pathmapper.mapper(f)
            if res.startswith("keep:"):
                res = res[5:]
            elif res.startswith("/keep/"):
                res = res[6:]
            else:
                continue
            sp = res.split("/", 1)
            pdh = sp[0]
            if pdh not in dirs:
                mounts[p] = {"kind": "collection", "portable_data_hash": pdh}
                if len(sp) == 2:
                    mounts[p]["path"] = sp[1]

        with Perf(metrics, "generatefiles %s" % self.name):
            if self.generatefiles["listing"]:
                vwd = arvados.collection.Collection(
                    api_client=self.arvrunner.api,
                    keep_client=self.arvrunner.keep_client,
                    num_retries=self.arvrunner.num_retries)
                generatemapper = NoFollowPathMapper([self.generatefiles],
                                                    "",
                                                    "",
                                                    separateDirs=False)

                with Perf(metrics, "createfiles %s" % self.name):
                    for f, p in generatemapper.items():
                        if not p.target:
                            pass
                        elif p.type in ("File", "Directory"):
                            source, path = self.arvrunner.fs_access.get_collection(
                                p.resolved)
                            vwd.copy(path, p.target, source_collection=source)
                        elif p.type == "CreateFile":
                            with vwd.open(p.target, "w") as n:
                                n.write(p.resolved.encode("utf-8"))

                with Perf(metrics, "generatefiles.save_new %s" % self.name):
                    vwd.save_new()

                for f, p in generatemapper.items():
                    if not p.target:
                        continue
                    mountpoint = "%s/%s" % (self.outdir, p.target)
                    mounts[mountpoint] = {
                        "kind": "collection",
                        "portable_data_hash": vwd.portable_data_hash(),
                        "path": p.target
                    }

        container_request["environment"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            raise UnsupportedRequirement(
                "Stdin redirection currently not suppported")

        if self.stderr:
            raise UnsupportedRequirement(
                "Stderr redirection currently not suppported")

        if self.stdout:
            mounts["stdout"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stdout)
            }

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs"}

        container_request["container_image"] = arv_docker_get_image(
            self.arvrunner.api, docker_req, pull_image,
            self.arvrunner.project_uuid)

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = get_feature(
            self, "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints[
                    "keep_cache_ram"] = runtime_req["keep_cache"] * 2**20

        partition_req, _ = get_feature(
            self, "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(
                partition_req["partition"])

        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["use_existing"] = kwargs.get("enable_reuse", True)
        container_request["scheduling_parameters"] = scheduling_parameters

        if kwargs.get("runnerjob", "").startswith("arvwf:"):
            wfuuid = kwargs["runnerjob"][6:kwargs["runnerjob"].index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(
                num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request).execute(
                    num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.processes[self.uuid] = self

            if response["state"] == "Final":
                logger.info("%s reused container %s",
                            self.arvrunner.label(self),
                            response["container_uuid"])
                self.done(response)
            else:
                logger.info("%s %s state is %s", self.arvrunner.label(self),
                            response["uuid"], response["state"])
        except Exception as e:
            logger.error("%s got error %s" %
                         (self.arvrunner.label(self), str(e)))
            self.output_callback({}, "permanentFail")
Ejemplo n.º 12
0
    def arv_executor(self,
                     updated_tool,
                     job_order,
                     runtimeContext,
                     logger=None):
        self.debug = runtimeContext.debug

        workbench1 = self.api.config()["Services"]["Workbench1"]["ExternalURL"]
        workbench2 = self.api.config()["Services"]["Workbench2"]["ExternalURL"]
        controller = self.api.config()["Services"]["Controller"]["ExternalURL"]
        logger.info("Using cluster %s (%s)",
                    self.api.config()["ClusterID"], workbench2 or workbench1
                    or controller)

        updated_tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception(
                "--trash-intermediate is only supported with --api=containers."
            )

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception(
                "--intermediate-output-ttl is only supported with --api=containers."
            )
        if self.intermediate_output_ttl < 0:
            raise Exception(
                "Invalid value %d for --intermediate-output-ttl, cannot be less than zero"
                % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception(
                "--submit-request-uuid requires containers API, but using '{}' api"
                .format(self.work_api))

        default_storage_classes = ",".join([
            k for k, v in self.api.config().get("StorageClasses", {
                "default": {
                    "Default": True
                }
            }).items() if v.get("Default") is True
        ])
        if runtimeContext.storage_classes == "default":
            runtimeContext.storage_classes = default_storage_classes
        if runtimeContext.intermediate_storage_classes == "default":
            runtimeContext.intermediate_storage_classes = default_storage_classes

        if not runtimeContext.name:
            runtimeContext.name = self.name = updated_tool.tool.get(
                "label") or updated_tool.metadata.get(
                    "label") or os.path.basename(updated_tool.tool["id"])

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     updated_tool, job_order)

        # the last clause means: if it is a command line tool, and we
        # are going to wait for the result, and always_submit_runner
        # is false, then we don't submit a runner process.

        submitting = (runtimeContext.update_workflow
                      or runtimeContext.create_workflow or
                      (runtimeContext.submit
                       and not (updated_tool.tool["class"] == "CommandLineTool"
                                and runtimeContext.wait
                                and not runtimeContext.always_submit_runner)))

        loadingContext = self.loadingContext.copy()
        loadingContext.do_validate = False
        if submitting:
            loadingContext.do_update = False
            # Document may have been auto-updated. Reload the original
            # document with updating disabled because we want to
            # submit the document with its original CWL version, not
            # the auto-updated one.
            tool = load_tool(updated_tool.tool["id"], loadingContext)
        else:
            tool = updated_tool

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Recreate process object (ArvadosWorkflow or
        # ArvadosCommandTool) because tool document may have been
        # updated by upload_workflow_deps in ways that modify
        # inheritance of hints or requirements.
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        tool = load_tool(tool.tool, loadingContext)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "containers":
                uuid = upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map,
                    submit_runner_image=runtimeContext.submit_runner_image)
                self.stdout.write(uuid + "\n")
                return (None, "success")

        self.apply_reqs(job_order, tool)

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception(
                    "--ignore-docker-for-reuse not supported with containers API."
                )
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]

            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))

            visit_class(job_order, ("File", "Directory"),
                        estimate_collection_cache)
            runtimeContext.collection_cache_size = max(
                ((estimated_size[0] * 192) // (1024 * 1024)) + 1, 256)
            self.collection_cache.set_cap(
                runtimeContext.collection_cache_size * 1024 * 1024)

        logger.info("Using collection cache size %s MiB",
                    runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if submitting:
                    tool = RunnerContainer(
                        self,
                        updated_tool,
                        tool,
                        loadingContext,
                        runtimeContext.enable_reuse,
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=runtimeContext.submit_runner_ram,
                        name=runtimeContext.name,
                        on_error=runtimeContext.on_error,
                        submit_runner_image=runtimeContext.submit_runner_image,
                        intermediate_output_ttl=runtimeContext.
                        intermediate_output_ttl,
                        merged_map=merged_map,
                        priority=runtimeContext.priority,
                        secret_store=self.secret_store,
                        collection_cache_size=runtimeContext.
                        collection_cache_size,
                        collection_cache_is_default=self.
                        should_estimate_cache_size)
                else:
                    runtimeContext.runnerjob = tool.tool["id"]

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order, self.output_callback, runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = next(jobiter)
            runnerjob.run(runtimeContext)
            self.stdout.write(runnerjob.uuid + "\n")
            return (None, "success")

        current_container = arvados_cwl.util.get_current_container(
            self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s",
                        current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable processes and not waiting on any pending processes."
                        )
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info(
            )[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            elif isinstance(sys.exc_info()[1], WorkflowException):
                logger.error(
                    "Workflow execution failed:\n%s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            else:
                logger.exception("Workflow execution failed")

            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)

            if self.work_api == "containers" and not current_container:
                # Not running in a crunch container, so cancel any outstanding processes.
                for p in self.processes:
                    try:
                        self.api.container_requests().update(
                            uuid=p, body={
                                "priority": "0"
                            }).execute(num_retries=self.num_retries)
                    except Exception:
                        pass
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
            if workbench2 or workbench1:
                logger.info("Output at %scollections/%s", workbench2
                            or workbench1, tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = ""
            storage_class_req, _ = tool.get_requirement(
                "http://arvados.org/cwl#OutputStorageClass")
            if storage_class_req and storage_class_req.get(
                    "finalStorageClass"):
                storage_classes = aslist(
                    storage_class_req["finalStorageClass"])
            else:
                storage_classes = runtimeContext.storage_classes.strip().split(
                    ",")

            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, storage_classes, self.output_tags,
                self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output,
                          partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Ejemplo n.º 13
0
    def run(self, dry_run=False, pull_image=True, **kwargs):
        container_request = {
            "command": self.command_line,
            "owner_uuid": self.arvrunner.project_uuid,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": 1,
            "state": "Committed"
        }
        runtime_constraints = {}
        mounts = {self.outdir: {"kind": "tmp"}}

        dirs = set()
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if tp == "Directory" and '/' not in p[6:]:
                mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]}
                dirs.add(p[6:])
        for f in self.pathmapper.files():
            _, p, tp = self.pathmapper.mapper(f)
            if p[6:].split("/")[0] not in dirs:
                mounts[p] = {"kind": "collection", "portable_data_hash": p[6:]}

        if self.generatefiles["listing"]:
            raise UnsupportedRequirement("Generate files not supported")

        container_request["environment"] = {
            "TMPDIR": self.tmpdir,
            "HOME": self.outdir
        }
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            raise UnsupportedRequirement(
                "Stdin redirection currently not suppported")

        if self.stderr:
            raise UnsupportedRequirement(
                "Stderr redirection currently not suppported")

        if self.stdout:
            mounts["stdout"] = {
                "kind": "file",
                "path": "%s/%s" % (self.outdir, self.stdout)
            }

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs"}

        container_request["container_image"] = arv_docker_get_image(
            self.arvrunner.api, docker_req, pull_image,
            self.arvrunner.project_uuid)

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = resources.get("cores", 1)
            runtime_constraints["ram"] = resources.get("ram") * 2**20

        api_req, _ = get_feature(self, "http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = get_feature(
            self, "http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            logger.warn(
                "RuntimeConstraints not yet supported by container API")

        partition_req, _ = get_feature(
            self, "http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            runtime_constraints["partition"] = aslist(
                partition_req["partition"])

        container_request["mounts"] = mounts
        container_request["runtime_constraints"] = runtime_constraints

        try:
            response = self.arvrunner.api.container_requests().create(
                body=container_request).execute(
                    num_retries=self.arvrunner.num_retries)

            self.arvrunner.processes[response["container_uuid"]] = self

            logger.info("Container %s (%s) request state is %s", self.name,
                        response["uuid"], response["state"])

            if response["state"] == "Final":
                self.done(response)
        except Exception as e:
            logger.error("Got error %s" % str(e))
            self.output_callback({}, "permanentFail")
Ejemplo n.º 14
0
    def collect_output(
            self,
            schema,  # type: Dict[Text, Any]
            builder,  # type: Builder
            outdir,  # type: Text
            fs_access,  # type: StdFsAccess
            compute_checksum=True  # type: bool
    ):
        # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]]
        result = []  # type: List[Any]
        empty_and_optional = False
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []  # type: List[Text]

            revmap = partial(command_line_tool.revmap_file, builder, outdir)

            if "glob" in binding:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    for glob in aslist(binding["glob"]):
                        glob = builder.do_eval(glob)
                        if glob:
                            globpatterns.extend(aslist(glob))

                    for glob in globpatterns:
                        if glob.startswith(outdir):
                            glob = glob[len(outdir) + 1:]
                        elif glob == ".":
                            glob = outdir
                        elif glob.startswith("/"):
                            raise WorkflowException(
                                "glob patterns must not start with '/'")
                        try:
                            prefix = fs_access.glob(outdir)
                            key = cmp_to_key(
                                cast(Callable[[Text, Text], int],
                                     locale.strcoll))

                            # In case of stdout.log or stderr.log file not created
                            if "stdout" in self.tool and "stderr" in self.tool \
                                    and glob in (self.tool["stdout"], self.tool["stderr"]):
                                filepath = Path(fs_access.join(outdir, glob))
                                if not filepath.is_file():
                                    Path(filepath).touch()

                            result.extend([{
                                "location":
                                g,
                                "path":
                                fs_access.join(builder.outdir,
                                               g[len(prefix[0]) + 1:]),
                                "basename":
                                os.path.basename(g),
                                "nameroot":
                                os.path.splitext(os.path.basename(g))[0],
                                "nameext":
                                os.path.splitext(os.path.basename(g))[1],
                                "class":
                                "File" if fs_access.isfile(g) else "Directory"
                            } for g in sorted(fs_access.glob(
                                fs_access.join(outdir, glob)),
                                              key=key)])
                        except (OSError, IOError) as exc:
                            LOGGER.warning(Text(exc))
                        except Exception:
                            LOGGER.exception("Unexpected error from fs_access")
                            raise

                for files in result:
                    rfile = files.copy()
                    # TODO This function raise an exception and seems to be related to docker (which is not used here)
                    # revmap(rfile)
                    if files["class"] == "Directory":
                        load_listing = builder.loadListing or (
                            binding and binding.get("loadListing"))
                        if load_listing and load_listing != "no_listing":
                            get_listing(fs_access, files,
                                        (load_listing == "deep_listing"))
                    else:
                        with fs_access.open(rfile["location"], "rb") as f:
                            contents = b""
                            if binding.get("loadContents") or compute_checksum:
                                contents = f.read(CONTENT_LIMIT)
                            if binding.get("loadContents"):
                                files["contents"] = contents.decode("utf-8")
                            if compute_checksum:
                                checksum = hashlib.sha1()  # nosec: B303
                                while contents != b"":
                                    checksum.update(contents)
                                    contents = f.read(1024 * 1024)
                                files[
                                    "checksum"] = "sha1$%s" % checksum.hexdigest(
                                    )
                            f.seek(0, 2)
                            file_size = f.tell()
                        files["size"] = file_size

            optional = False
            single = False
            if isinstance(schema["type"], list):
                if "null" in schema["type"]:
                    optional = True
                if "File" in schema["type"] or "Directory" in schema["type"]:
                    single = True
            elif schema["type"] == "File" or schema["type"] == "Directory":
                single = True

            if "outputEval" in binding:
                with SourceLine(binding, "outputEval", WorkflowException,
                                debug):
                    result = builder.do_eval(binding["outputEval"],
                                             context=result)

            if single:
                if not result and not optional:
                    with SourceLine(binding, "glob", WorkflowException, debug):
                        raise WorkflowException(
                            "Did not find output file with glob pattern: '{}'".
                            format(globpatterns))
                elif not result and optional:
                    pass
                elif isinstance(result, list):
                    if len(result) > 1:
                        raise WorkflowException(
                            "Multiple matches for output item that is a single file."
                        )
                    result = result[0]

            if "secondaryFiles" in schema:
                with SourceLine(schema, "secondaryFiles", WorkflowException,
                                debug):
                    for primary in aslist(result):
                        if isinstance(primary, dict):
                            primary.setdefault("secondaryFiles", [])
                            pathprefix = primary["path"][0:primary["path"].
                                                         rindex("/") + 1]
                            for file in aslist(schema["secondaryFiles"]):
                                if isinstance(
                                        file,
                                        dict) or "$(" in file or "${" in file:
                                    sfpath = builder.do_eval(file,
                                                             context=primary)
                                    subst = False
                                else:
                                    sfpath = file
                                    subst = True
                                for sfitem in aslist(sfpath):
                                    if isinstance(sfitem, str):
                                        if subst:
                                            sfitem = {
                                                "path":
                                                substitute(
                                                    primary["path"], sfitem)
                                            }
                                        else:
                                            sfitem = {
                                                "path": pathprefix + sfitem
                                            }
                                    if "path" in sfitem and "location" not in sfitem:
                                        revmap(sfitem)
                                    if fs_access.isfile(sfitem["location"]):
                                        sfitem["class"] = "File"
                                        primary["secondaryFiles"].append(
                                            sfitem)
                                    elif fs_access.isdir(sfitem["location"]):
                                        sfitem["class"] = "Directory"
                                        primary["secondaryFiles"].append(
                                            sfitem)

            if "format" in schema:
                for primary in aslist(result):
                    primary["format"] = builder.do_eval(schema["format"],
                                                        context=primary)

            # Ensure files point to local references outside of the run environment
            # TODO: Again removing revmap....
            # adjustFileObjs(result, revmap)

            if not result and optional:
                return None

        if not empty_and_optional and isinstance(
                schema["type"], dict) and schema["type"]["type"] == "record":
            out = {}
            for f in schema["type"]["fields"]:
                out[shortname(
                    f["name"])] = self.collect_output(  # type: ignore
                        f,
                        builder,
                        outdir,
                        fs_access,
                        compute_checksum=compute_checksum)
            return out
        return result
Ejemplo n.º 15
0
    def collect_output(
            self,
            schema,  # type: Dict[Text, Any]
            builder,  # type: Builder
            outdir,  # type: Text
            fs_access,  # type: StdFsAccess
            compute_checksum=True  # type: bool
    ):
        # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]]
        """
        Collect outputs from the step :term:`Process` following its execution.

        .. note:
            When :term:`CWL` runner tries to forward ``step(i) outputs -> step(i+1) inputs``
            using :meth:`collect_outputs`, it expects exact ``outputBindings`` locations to be matched.
            In other words, a definition like ``outputBindings: {glob: outputs/*.txt}`` will generate results located
            in ``step(i)`` as ``"<tmp-workdir>/outputs/file.txt"`` and ``step(i+1)`` will look explicitly
            in ``"<tmp-workdir>/outputs`` using the ``glob`` pattern. Because each of our :term:`Process` in
            the workflow are distinct/remote entities, each one stages its outputs at different URL locations,
            not sharing the same *root directory*. When we stage intermediate results locally, the sub-dirs are lost.
            Therefore, they act like individual :term:`CWL` runner calls where the *final results* are moved back
            to the local directory for convenient access, but our *local directory* is the URL WPS-outputs location.
            To let :term:`CWL` :term:`Workflow` inter-steps mapping work as intended, we must remap the locations
            ignoring any nested dirs where the modified *outputBindings* definition will be able to match as if each
            step :term:`Process` outputs were generated locally.
        """
        result = []  # type: List[Any]
        empty_and_optional = False
        debug = LOGGER.isEnabledFor(logging.DEBUG)
        if "outputBinding" in schema:
            binding = schema["outputBinding"]
            globpatterns = []  # type: List[Text]

            revmap = partial(command_line_tool.revmap_file, builder, outdir)

            if "glob" in binding:
                with SourceLine(binding, "glob", WorkflowException, debug):
                    for glob in aslist(binding["glob"]):
                        glob = builder.do_eval(glob)
                        if glob:
                            globpatterns.extend(aslist(glob))

                    # rebase glob pattern as applicable (see note)
                    for glob in list(globpatterns):
                        if not any(
                                glob.startswith(part)
                                for part in [".", "/", "~"]) and "/" in glob:
                            glob = builder.do_eval(glob.split("/")[-1])
                            if glob:
                                globpatterns.extend(aslist(glob))

                    for glob in globpatterns:
                        if glob.startswith(outdir):
                            glob = glob[len(outdir) + 1:]
                        elif glob == ".":
                            glob = outdir
                        elif glob.startswith("/"):
                            raise WorkflowException(
                                "glob patterns must not start with '/'")
                        try:
                            prefix = fs_access.glob(outdir)
                            key = cmp_to_key(
                                cast(Callable[[Text, Text], int],
                                     locale.strcoll))

                            # In case of stdout.log or stderr.log file not created
                            if "stdout" in self.tool and "stderr" in self.tool \
                                    and glob in (self.tool["stdout"], self.tool["stderr"]):
                                filepath = Path(fs_access.join(outdir, glob))
                                if not filepath.is_file():
                                    Path(filepath).touch()

                            result.extend([{
                                "location":
                                g,
                                "path":
                                fs_access.join(builder.outdir,
                                               g[len(prefix[0]) + 1:]),
                                "basename":
                                os.path.basename(g),
                                "nameroot":
                                os.path.splitext(os.path.basename(g))[0],
                                "nameext":
                                os.path.splitext(os.path.basename(g))[1],
                                "class":
                                "File" if fs_access.isfile(g) else "Directory"
                            } for g in sorted(fs_access.glob(
                                fs_access.join(outdir, glob)),
                                              key=key)])
                        except (OSError, IOError) as exc:
                            LOGGER.warning(Text(exc))
                        except Exception:
                            LOGGER.exception("Unexpected error from fs_access")
                            raise

                for files in result:
                    rfile = files.copy()
                    # TODO This function raise an exception and seems to be related to docker (which is not used here)
                    # revmap(rfile)
                    if files["class"] == "Directory":
                        load_listing = builder.loadListing or (
                            binding and binding.get("loadListing"))
                        if load_listing and load_listing != "no_listing":
                            get_listing(fs_access, files,
                                        (load_listing == "deep_listing"))
                    else:
                        with fs_access.open(rfile["location"], "rb") as f:
                            contents = b""
                            if binding.get("loadContents") or compute_checksum:
                                contents = f.read(CONTENT_LIMIT)
                            if binding.get("loadContents"):
                                files["contents"] = contents.decode("utf-8")
                            if compute_checksum:
                                checksum = hashlib.sha1()  # nosec: B303
                                while contents != b"":
                                    checksum.update(contents)
                                    contents = f.read(1024 * 1024)
                                files[
                                    "checksum"] = f"sha1${checksum.hexdigest()}"
                            f.seek(0, 2)
                            file_size = f.tell()
                        files["size"] = file_size

            optional = False
            single = False
            if isinstance(schema["type"], list):
                if "null" in schema["type"]:
                    optional = True
                if "File" in schema["type"] or "Directory" in schema["type"]:
                    single = True
            elif schema["type"] == "File" or schema["type"] == "Directory":
                single = True

            if "outputEval" in binding:
                with SourceLine(binding, "outputEval", WorkflowException,
                                debug):
                    result = builder.do_eval(binding["outputEval"],
                                             context=result)

            if single:
                if not result and not optional:
                    with SourceLine(binding, "glob", WorkflowException, debug):
                        raise WorkflowException(
                            f"Did not find output file with glob pattern: '{globpatterns}'"
                        )
                elif not result and optional:
                    pass
                elif isinstance(result, list):
                    if len(result) > 1:
                        raise WorkflowException(
                            "Multiple matches for output item that is a single file."
                        )
                    result = result[0]

            if "secondaryFiles" in schema:
                with SourceLine(schema, "secondaryFiles", WorkflowException,
                                debug):
                    for primary in aslist(result):
                        if isinstance(primary, dict):
                            primary.setdefault("secondaryFiles", [])
                            pathprefix = primary["path"][0:primary["path"].
                                                         rindex("/") + 1]
                            for file in aslist(schema["secondaryFiles"]):
                                if isinstance(
                                        file,
                                        dict) or "$(" in file or "${" in file:
                                    sfpath = builder.do_eval(file,
                                                             context=primary)
                                    subst = False
                                else:
                                    sfpath = file
                                    subst = True
                                for sfitem in aslist(sfpath):
                                    if isinstance(sfitem, str):
                                        if subst:
                                            sfitem = {
                                                "path":
                                                substitute(
                                                    primary["path"], sfitem)
                                            }
                                        else:
                                            sfitem = {
                                                "path": pathprefix + sfitem
                                            }
                                    if "path" in sfitem and "location" not in sfitem:
                                        revmap(sfitem)
                                    if fs_access.isfile(sfitem["location"]):
                                        sfitem["class"] = "File"
                                        primary["secondaryFiles"].append(
                                            sfitem)
                                    elif fs_access.isdir(sfitem["location"]):
                                        sfitem["class"] = "Directory"
                                        primary["secondaryFiles"].append(
                                            sfitem)

            if "format" in schema:
                for primary in aslist(result):
                    primary["format"] = builder.do_eval(schema["format"],
                                                        context=primary)

            # Ensure files point to local references outside of the run environment
            # TODO: Again removing revmap....
            # adjustFileObjs(result, revmap)

            if not result and optional:
                return None

        if not empty_and_optional and isinstance(
                schema["type"], dict) and schema["type"]["type"] == "record":
            out = {}
            for f in schema["type"]["fields"]:
                out[shortname(
                    f["name"])] = self.collect_output(  # type: ignore
                        f,
                        builder,
                        outdir,
                        fs_access,
                        compute_checksum=compute_checksum)
            return out
        return result
Ejemplo n.º 16
0
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary,
                  discovered):
    if isinstance(inputschema,
                  Sequence) and not isinstance(inputschema, basestring):
        # union type, collect all possible secondaryFiles
        for i in inputschema:
            set_secondary(fsaccess, builder, i, secondaryspec, primary,
                          discovered)
        return

    if isinstance(inputschema, basestring):
        sd = search_schemadef(inputschema,
                              reversed(builder.hints + builder.requirements))
        if sd:
            inputschema = sd
        else:
            return

    if "secondaryFiles" in inputschema:
        # set secondaryFiles, may be inherited by compound types.
        secondaryspec = inputschema["secondaryFiles"]

    if (isinstance(inputschema["type"], (Mapping, Sequence))
            and not isinstance(inputschema["type"], basestring)):
        # compound type (union, array, record)
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)

    elif (inputschema["type"] == "record" and isinstance(primary, Mapping)):
        #
        # record type, find secondary files associated with fields.
        #
        for f in inputschema["fields"]:
            p = primary.get(shortname(f["name"]))
            if p:
                set_secondary(fsaccess, builder, f, secondaryspec, p,
                              discovered)

    elif (inputschema["type"] == "array" and isinstance(primary, Sequence)):
        #
        # array type, find secondary files of elements
        #
        for p in primary:
            set_secondary(fsaccess, builder, {"type": inputschema["items"]},
                          secondaryspec, p, discovered)

    elif (inputschema["type"] == "File" and secondaryspec
          and isinstance(primary, Mapping) and primary.get("class") == "File"
          and "secondaryFiles" not in primary):
        #
        # Found a file, check for secondaryFiles
        #
        specs = []
        primary["secondaryFiles"] = secondaryspec
        for i, sf in enumerate(aslist(secondaryspec)):
            if builder.cwlVersion == "v1.0":
                pattern = builder.do_eval(sf, context=primary)
            else:
                pattern = builder.do_eval(sf["pattern"], context=primary)
            if pattern is None:
                continue
            if isinstance(pattern, list):
                specs.extend(pattern)
            elif isinstance(pattern, dict):
                specs.append(pattern)
            elif isinstance(pattern, str):
                if builder.cwlVersion == "v1.0":
                    specs.append({"pattern": pattern, "required": True})
                else:
                    specs.append({
                        "pattern": pattern,
                        "required": sf.get("required")
                    })
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

        found = []
        for i, sf in enumerate(specs):
            if isinstance(sf, dict):
                if sf.get("class") == "File":
                    pattern = None
                    if sf.get("location") is None:
                        raise SourceLine(
                            primary["secondaryFiles"], i,
                            validate.ValidationException).makeError(
                                "File object is missing 'location': %s" % sf)
                    sfpath = sf["location"]
                    required = True
                else:
                    pattern = sf["pattern"]
                    required = sf.get("required")
            elif isinstance(sf, str):
                pattern = sf
                required = True
            else:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Expression must return list, object, string or null")

            if pattern is not None:
                sfpath = substitute(primary["location"], pattern)

            required = builder.do_eval(required, context=primary)

            if fsaccess.exists(sfpath):
                if pattern is not None:
                    found.append({"location": sfpath, "class": "File"})
                else:
                    found.append(sf)
            elif required:
                raise SourceLine(
                    primary["secondaryFiles"], i,
                    validate.ValidationException).makeError(
                        "Required secondary file '%s' does not exist" % sfpath)

        primary["secondaryFiles"] = cmap(found)
        if discovered is not None:
            discovered[primary["location"]] = primary["secondaryFiles"]
    elif inputschema["type"] not in primitive_types_set:
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec,
                      primary, discovered)
Ejemplo n.º 17
0
    def run(self, runtimeContext):
        # ArvadosCommandTool subclasses from cwltool.CommandLineTool,
        # which calls makeJobRunner() to get a new ArvadosContainer
        # object.  The fields that define execution such as
        # command_line, environment, etc are set on the
        # ArvadosContainer object by CommandLineTool.job() before
        # run() is called.

        runtimeContext = self.job_runtime

        container_request = {
            "command": self.command_line,
            "name": self.name,
            "output_path": self.outdir,
            "cwd": self.outdir,
            "priority": runtimeContext.priority,
            "state": "Committed",
            "properties": {},
        }
        runtime_constraints = {}

        if runtimeContext.project_uuid:
            container_request["owner_uuid"] = runtimeContext.project_uuid

        if self.arvrunner.secret_store.has_secret(self.command_line):
            raise WorkflowException("Secret material leaked on command line, only file literals may contain secrets")

        if self.arvrunner.secret_store.has_secret(self.environment):
            raise WorkflowException("Secret material leaked in environment, only file literals may contain secrets")

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["vcpus"] = math.ceil(resources.get("cores", 1))
            runtime_constraints["ram"] = math.ceil(resources.get("ram") * 2**20)

        mounts = {
            self.outdir: {
                "kind": "tmp",
                "capacity": math.ceil(resources.get("outdirSize", 0) * 2**20)
            },
            self.tmpdir: {
                "kind": "tmp",
                "capacity": math.ceil(resources.get("tmpdirSize", 0) * 2**20)
            }
        }
        secret_mounts = {}
        scheduling_parameters = {}

        rf = [self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files]
        rf.sort(key=lambda k: k.resolved)
        prevdir = None
        for resolved, target, tp, stg in rf:
            if not stg:
                continue
            if prevdir and target.startswith(prevdir):
                continue
            if tp == "Directory":
                targetdir = target
            else:
                targetdir = os.path.dirname(target)
            sp = resolved.split("/", 1)
            pdh = sp[0][5:]   # remove "keep:"
            mounts[targetdir] = {
                "kind": "collection",
                "portable_data_hash": pdh
            }
            if pdh in self.pathmapper.pdh_to_uuid:
                mounts[targetdir]["uuid"] = self.pathmapper.pdh_to_uuid[pdh]
            if len(sp) == 2:
                if tp == "Directory":
                    path = sp[1]
                else:
                    path = os.path.dirname(sp[1])
                if path and path != "/":
                    mounts[targetdir]["path"] = path
            prevdir = targetdir + "/"

        with Perf(metrics, "generatefiles %s" % self.name):
            if self.generatefiles["listing"]:
                vwd = arvados.collection.Collection(api_client=self.arvrunner.api,
                                                    keep_client=self.arvrunner.keep_client,
                                                    num_retries=self.arvrunner.num_retries)
                generatemapper = NoFollowPathMapper(self.generatefiles["listing"], "", "",
                                                    separateDirs=False)

                sorteditems = sorted(generatemapper.items(), key=lambda n: n[1].target)

                logger.debug("generatemapper is %s", sorteditems)

                with Perf(metrics, "createfiles %s" % self.name):
                    for f, p in sorteditems:
                        if not p.target:
                            pass
                        elif p.type in ("File", "Directory", "WritableFile", "WritableDirectory"):
                            if p.resolved.startswith("_:"):
                                vwd.mkdirs(p.target)
                            else:
                                source, path = self.arvrunner.fs_access.get_collection(p.resolved)
                                vwd.copy(path or ".", p.target, source_collection=source)
                        elif p.type == "CreateFile":
                            if self.arvrunner.secret_store.has_secret(p.resolved):
                                secret_mounts["%s/%s" % (self.outdir, p.target)] = {
                                    "kind": "text",
                                    "content": self.arvrunner.secret_store.retrieve(p.resolved)
                                }
                            else:
                                with vwd.open(p.target, "w") as n:
                                    n.write(p.resolved)

                def keepemptydirs(p):
                    if isinstance(p, arvados.collection.RichCollectionBase):
                        if len(p) == 0:
                            p.open(".keep", "w").close()
                        else:
                            for c in p:
                                keepemptydirs(p[c])

                keepemptydirs(vwd)

                if not runtimeContext.current_container:
                    runtimeContext.current_container = arvados_cwl.util.get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
                info = arvados_cwl.util.get_intermediate_collection_info(self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl)
                vwd.save_new(name=info["name"],
                             owner_uuid=runtimeContext.project_uuid,
                             ensure_unique_name=True,
                             trash_at=info["trash_at"],
                             properties=info["properties"])

                prev = None
                for f, p in sorteditems:
                    if (not p.target or self.arvrunner.secret_store.has_secret(p.resolved) or
                        (prev is not None and p.target.startswith(prev))):
                        continue
                    mountpoint = "%s/%s" % (self.outdir, p.target)
                    mounts[mountpoint] = {"kind": "collection",
                                          "portable_data_hash": vwd.portable_data_hash(),
                                          "path": p.target}
                    if p.type.startswith("Writable"):
                        mounts[mountpoint]["writable"] = True
                    prev = p.target + "/"

        container_request["environment"] = {"TMPDIR": self.tmpdir, "HOME": self.outdir}
        if self.environment:
            container_request["environment"].update(self.environment)

        if self.stdin:
            sp = self.stdin[6:].split("/", 1)
            mounts["stdin"] = {"kind": "collection",
                                "portable_data_hash": sp[0],
                                "path": sp[1]}

        if self.stderr:
            mounts["stderr"] = {"kind": "file",
                                "path": "%s/%s" % (self.outdir, self.stderr)}

        if self.stdout:
            mounts["stdout"] = {"kind": "file",
                                "path": "%s/%s" % (self.outdir, self.stdout)}

        (docker_req, docker_is_req) = self.get_requirement("DockerRequirement")
        if not docker_req:
            docker_req = {"dockerImageId": "arvados/jobs"}

        container_request["container_image"] = arv_docker_get_image(self.arvrunner.api,
                                                                    docker_req,
                                                                    runtimeContext.pull_image,
                                                                    runtimeContext.project_uuid)

        api_req, _ = self.get_requirement("http://arvados.org/cwl#APIRequirement")
        if api_req:
            runtime_constraints["API"] = True

        runtime_req, _ = self.get_requirement("http://arvados.org/cwl#RuntimeConstraints")
        if runtime_req:
            if "keep_cache" in runtime_req:
                runtime_constraints["keep_cache_ram"] = math.ceil(runtime_req["keep_cache"] * 2**20)
            if "outputDirType" in runtime_req:
                if runtime_req["outputDirType"] == "local_output_dir":
                    # Currently the default behavior.
                    pass
                elif runtime_req["outputDirType"] == "keep_output_dir":
                    mounts[self.outdir]= {
                        "kind": "collection",
                        "writable": True
                    }

        partition_req, _ = self.get_requirement("http://arvados.org/cwl#PartitionRequirement")
        if partition_req:
            scheduling_parameters["partitions"] = aslist(partition_req["partition"])

        intermediate_output_req, _ = self.get_requirement("http://arvados.org/cwl#IntermediateOutput")
        if intermediate_output_req:
            self.output_ttl = intermediate_output_req["outputTTL"]
        else:
            self.output_ttl = self.arvrunner.intermediate_output_ttl

        if self.output_ttl < 0:
            raise WorkflowException("Invalid value %d for output_ttl, cannot be less than zero" % container_request["output_ttl"])

        if self.timelimit is not None:
            scheduling_parameters["max_run_time"] = self.timelimit

        extra_submit_params = {}
        if runtimeContext.submit_runner_cluster:
            extra_submit_params["cluster_id"] = runtimeContext.submit_runner_cluster

        container_request["output_name"] = "Output for step %s" % (self.name)
        container_request["output_ttl"] = self.output_ttl
        container_request["mounts"] = mounts
        container_request["secret_mounts"] = secret_mounts
        container_request["runtime_constraints"] = runtime_constraints
        container_request["scheduling_parameters"] = scheduling_parameters

        enable_reuse = runtimeContext.enable_reuse
        if enable_reuse:
            reuse_req, _ = self.get_requirement("http://arvados.org/cwl#ReuseRequirement")
            if reuse_req:
                enable_reuse = reuse_req["enableReuse"]
        container_request["use_existing"] = enable_reuse

        if runtimeContext.runnerjob.startswith("arvwf:"):
            wfuuid = runtimeContext.runnerjob[6:runtimeContext.runnerjob.index("#")]
            wfrecord = self.arvrunner.api.workflows().get(uuid=wfuuid).execute(num_retries=self.arvrunner.num_retries)
            if container_request["name"] == "main":
                container_request["name"] = wfrecord["name"]
            container_request["properties"]["template_uuid"] = wfuuid

        self.output_callback = self.arvrunner.get_wrapped_callback(self.output_callback)

        try:
            if runtimeContext.submit_request_uuid:
                response = self.arvrunner.api.container_requests().update(
                    uuid=runtimeContext.submit_request_uuid,
                    body=container_request,
                    **extra_submit_params
                ).execute(num_retries=self.arvrunner.num_retries)
            else:
                response = self.arvrunner.api.container_requests().create(
                    body=container_request,
                    **extra_submit_params
                ).execute(num_retries=self.arvrunner.num_retries)

            self.uuid = response["uuid"]
            self.arvrunner.process_submitted(self)

            if response["state"] == "Final":
                logger.info("%s reused container %s", self.arvrunner.label(self), response["container_uuid"])
            else:
                logger.info("%s %s state is %s", self.arvrunner.label(self), response["uuid"], response["state"])
        except Exception:
            logger.exception("%s got an error", self.arvrunner.label(self))
            self.output_callback({}, "permanentFail")
Ejemplo n.º 18
0
    def run(self, fileStore):
        cwljob = resolve_indirect(self.cwljob)

        # `promises` dict
        # from: each parameter (workflow input or step output)
        #   that may be used as a "source" for a step input workflow output
        #   parameter
        # to: the job that will produce that value.
        promises = {}

        # `jobs` dict from step id to job that implements that step.
        jobs = {}

        for inp in self.cwlwf.tool["inputs"]:
            promises[inp["id"]] = SelfJob(self, cwljob)

        alloutputs_fufilled = False
        while not alloutputs_fufilled:
            # Iteratively go over the workflow steps, scheduling jobs as their
            # dependencies can be fufilled by upstream workflow inputs or
            # step outputs.  Loop exits when the workflow outputs
            # are satisfied.

            alloutputs_fufilled = True

            for step in self.cwlwf.steps:
                if step.tool["id"] not in jobs:
                    stepinputs_fufilled = True
                    for inp in step.tool["inputs"]:
                        if "source" in inp:
                            for s in aslist(inp["source"]):
                                if s not in promises:
                                    stepinputs_fufilled = False
                    if stepinputs_fufilled:
                        jobobj = {}

                        for inp in step.tool["inputs"]:
                            key = shortname(inp["id"])
                            if "source" in inp:
                                if inp.get("linkMerge") or len(aslist(inp["source"])) > 1:
                                    linkMerge = inp.get("linkMerge", "merge_nested")
                                    if linkMerge == "merge_nested":
                                        jobobj[key] = (
                                            MergeInputsNested([(shortname(s), promises[s].rv())
                                                               for s in aslist(inp["source"])]))
                                    elif linkMerge == "merge_flattened":
                                        jobobj[key] = (
                                            MergeInputsFlattened([(shortname(s), promises[s].rv())
                                                                  for s in aslist(inp["source"])]))
                                    else:
                                        raise validate.ValidationException(
                                            "Unsupported linkMerge '%s'", linkMerge)
                                else:
                                    jobobj[key] = (
                                    shortname(inp["source"]), promises[inp["source"]].rv())
                            elif "default" in inp:
                                d = copy.copy(inp["default"])
                                jobobj[key] = ("default", {"default": d})

                            if "valueFrom" in inp and "scatter" not in step.tool:
                                if key in jobobj:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                jobobj[key],
                                                                self.cwlwf.requirements)
                                else:
                                    jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                                ("None", {"None": None}),
                                                                self.cwlwf.requirements)

                        if "scatter" in step.tool:
                            wfjob = CWLScatter(step, IndirectDict(jobobj), **self.executor_options)
                            followOn = CWLGather(step, wfjob.rv())
                            wfjob.addFollowOn(followOn)
                        else:
                            (wfjob, followOn) = makeJob(step.embedded_tool, IndirectDict(jobobj),
                                                        **self.executor_options)

                        jobs[step.tool["id"]] = followOn

                        connected = False
                        for inp in step.tool["inputs"]:
                            for s in aslist(inp.get("source", [])):
                                if not promises[s].hasChild(wfjob):
                                    promises[s].addChild(wfjob)
                                    connected = True
                        if not connected:
                            # workflow step has default inputs only, isn't connected to other jobs,
                            # so add it as child of workflow.
                            self.addChild(wfjob)

                        for out in step.tool["outputs"]:
                            promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for s in aslist(inp.get("source", [])):
                        if s not in promises:
                            alloutputs_fufilled = False

            # may need a test
            for out in self.cwlwf.tool["outputs"]:
                if "source" in out:
                    if out["source"] not in promises:
                        alloutputs_fufilled = False

        outobj = {}
        for out in self.cwlwf.tool["outputs"]:
            outobj[shortname(out["id"])] = (shortname(out["outputSource"]), promises[out["outputSource"]].rv())

        return IndirectDict(outobj)